Apriori Implementation

Date: January 26, 2018 · Category: Development · by VulgaireDev

In this notebook, we will implement the Apriori algorithm as described in "Fast Algorithms for Mining Association Rules" by Rakesh Agrawal and Ramakrishnan Srikant.

First we will try a naive implementation, then we will move to more appropriate data structures.

Naive approach

In [1]:
import pandas as pd
import numpy as np
import time

from itertools import combinations

df = pd.read_csv('./data.csv')
df = df.drop('1', axis=1)
df.head(10)
Out[1]:
0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 ... 0.38 0.39 0.40 0.41 0.42 0.43 0.44 0.45 0.46 0.47
0 0 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 1 0 0 0 0
1 0 0 0 1 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
2 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
3 0 0 0 0 0 0 1 0 0 0 ... 0 0 1 0 0 0 0 0 0 0
4 0 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 0 0
6 0 0 1 1 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 1 1 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 1

10 rows × 50 columns
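
The original data.csv is not distributed with this post. If you want to run the notebook without it, a random binary transaction matrix of similar shape can stand in. This is only an assumption: the column names, size and density below are made up, not the real dataset's:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# ~10000 transactions over 50 items, ~8% density (both numbers are guesses)
stand_in = pd.DataFrame(
    (rng.random((10000, 50)) < 0.08).astype(int),
    columns=[str(i) for i in range(50)],
)
stand_in.to_csv('./data.csv', index=False)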

In [2]:
MIN_SUPP = 0.05
MIN_CONF = 0.05
In [3]:
def is_frequent(itemset, min_supp):
    """Scan the whole DataFrame and check whether itemset reaches min_supp."""
    count = 0
    for _, row in df.iterrows():
        # a row supports the itemset only if every one of its items is present
        if all(row[item] == 1 for item in itemset):
            count += 1
    frequency = count/df["0"].count()  # df["0"].count() == number of transactions

    return frequency >= min_supp
In [4]:
def itemset_1_frequent(df, min_supp):
    sum_column = df.sum()
    result = []
    
    for index, count in sum_column.items():
        if count/df["0"].count() > min_supp:
            result.append(set([index]))
            
    return result

itemsets = {} # dict of lists of sets
itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
print(itemsets)
{1: [{'0'}, {'0.1'}, {'0.2'}, {'0.3'}, {'0.4'}, {'0.5'}, {'0.7'}, {'0.9'}, {'1.1'}, {'0.11'}, {'0.13'}, {'0.14'}, {'0.15'}, {'0.16'}, {'0.17'}, {'0.18'}, {'0.20'}, {'0.21'}, {'0.22'}, {'0.25'}, {'0.26'}, {'0.27'}, {'0.29'}, {'0.30'}, {'0.31'}, {'0.33'}, {'0.34'}, {'0.35'}, {'0.38'}, {'0.39'}, {'0.40'}, {'0.41'}, {'0.42'}, {'0.43'}, {'0.44'}, {'0.45'}, {'0.46'}, {'0.47'}]}
In [5]:
def apriori_gen(itemsets):
    # simple approach, not the one from the paper
    # it works but is not as optimized as the paper's approach
    items = set()
    for x_itemset in itemsets:
        for item in x_itemset:
            items.add(item)

    C_k = set()

    # generation step: extend each itemset with every item it does not contain
    for x_itemset in itemsets:
        for item in items:
            if item not in x_itemset:
                C_k.add(frozenset([*x_itemset, item]))

    # prune step: drop candidates that have an infrequent (k-1)-subset
    elt_to_remove = set()

    for x_itemset in C_k:
        for elt in x_itemset:
            sub = set(x_itemset)
            sub.remove(elt)
            if sub not in itemsets:
                elt_to_remove.add(x_itemset)

    for remove in elt_to_remove:
        C_k.remove(remove)

    return C_k

apriori_gen([{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'}, {'1', '3', '5'}, {'2', '3', '4'}])
Out[5]:
{frozenset({'1', '2', '3', '4'})}

In Apriori, there are three ways an itemset can be ruled out as infrequent. It could be because:

  • it is not generated (in the first part of apriori_gen)
  • it is pruned (second part of apriori_gen, based on the fact that if the itemset {beer} is infrequent, then the itemset {beer, milk} is infrequent too)
  • it does not reach the minimum support (we count its occurrences in the data)

With the approach used here, the number of generated itemsets is greater than with the paper's approach, which degrades performance. The prune step is illustrated in isolation below.
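
To make the prune step concrete, here is a minimal standalone sketch of the check it relies on. has_infrequent_subset is my own helper name, not something from the paper:

from itertools import combinations

def has_infrequent_subset(candidate, frequent_prev):
    # a k-candidate survives only if all of its (k-1)-subsets are frequent
    for sub in combinations(candidate, len(candidate) - 1):
        if set(sub) not in frequent_prev:
            return True
    return False

prev = [{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'}, {'2', '3', '4'}]
print(has_infrequent_subset({'1', '2', '3', '4'}, prev))  # False: every 3-subset is frequent
print(has_infrequent_subset({'1', '2', '3', '5'}, prev))  # True: {'1', '2', '5'} is missing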

In [6]:
def subset(candidates, row):
    """ Filter the candidate itemsets that are fully present in the row
    """
    # dummy solution (the paper uses a hash tree for this step)
    result = set()

    for candidate in candidates:
        add = True
        for elt in candidate:
            if not row[elt]:
                add = False

        if add:
            result.add(frozenset(candidate))

    return result

candidates = [{'0.7', '0.35'}, {'0', '0.1'}]
row = next(df.iterrows())[1]
print(subset(candidates, row))
{frozenset({'0.7', '0.35'})}
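
For comparison, here is a sketch of the same subset test done with frozensets instead of per-label Series lookups; this is essentially the trick the in-RAM version below relies on. subset_fast is a hypothetical name, and row and candidates are the variables defined just above:

def subset_fast(candidates, row_set):
    # keep the candidates fully contained in row_set (a frozenset of item names)
    return {frozenset(c) for c in candidates if frozenset(c) <= row_set}

row_set = frozenset(header for header, value in row.items() if value == 1)
print(subset_fast(candidates, row_set))  # {frozenset({'0.7', '0.35'})}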
In [7]:
def aPriori():
    itemsets = {} # dict of lists of sets
    itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
    last_itemset = itemsets[1]
    itemset_size = 1

    while last_itemset:
        itemset_size += 1

        candidates = apriori_gen(last_itemset)
        candidates_count = {}

        for index, row in df.iterrows():
            local_candidates = subset(candidates, row)
            for local_candidate in local_candidates:
                # frozensets are hashable, so they can serve directly as dict keys
                candidates_count[local_candidate] = candidates_count.get(local_candidate, 0) + 1

        # check minimum support
        for candidate in candidates:
            # .get(..., 0): a candidate may appear in no row at all
            if candidates_count.get(candidate, 0)/df["0"].count() > MIN_SUPP:
                itemsets.setdefault(itemset_size, []).append(candidate)

        last_itemset = itemsets.get(itemset_size, [])

    return itemsets

print(aPriori())

# let's print the time it takes with this approach

start_time = time.time()
aPriori()
print("APriori took {} seconds".format(time.time() - start_time))
{1: [{'0'}, {'0.1'}, {'0.2'}, {'0.3'}, {'0.4'}, {'0.5'}, {'0.7'}, {'0.9'}, {'1.1'}, {'0.11'}, {'0.13'}, {'0.14'}, {'0.15'}, {'0.16'}, {'0.17'}, {'0.18'}, {'0.20'}, {'0.21'}, {'0.22'}, {'0.25'}, {'0.26'}, {'0.27'}, {'0.29'}, {'0.30'}, {'0.31'}, {'0.33'}, {'0.34'}, {'0.35'}, {'0.38'}, {'0.39'}, {'0.40'}, {'0.41'}, {'0.42'}, {'0.43'}, {'0.44'}, {'0.45'}, {'0.46'}, {'0.47'}], 2: [frozenset({'0.25', '0.26'}), frozenset({'0.17', '0.33'})]}
APriori took 699.8140921592712 seconds

In 1994, database accesses were costly. Today, a dataset of this size fits in RAM. So instead of taking each row and extracting subsets, we can take each itemset and count its occurrences. Theoretically, the complexities are the same.

In [10]:
def aPrioriRAM():
    itemsets = {} # dict of lists of sets
    itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
    last_itemset = itemsets[1]
    itemset_size = 1

    row_list = []

    # each row becomes a frozenset of the items it contains
    for index, row in df.iterrows():
        elements = []

        for header, value in row.items():
            if value == 1:
                elements.append(header)

        row_list.append(frozenset(elements))

    while last_itemset:
        itemset_size += 1

        candidates = apriori_gen(last_itemset)
        for candidate in candidates:
            count = 0

            for row in row_list:
                if candidate.issubset(row):
                    count += 1

            if count/df["0"].count() > MIN_SUPP:
                itemsets.setdefault(itemset_size, []).append(candidate)

        last_itemset = itemsets.get(itemset_size, [])

    return itemsets

       
start_time = time.time()
print(aPrioriRAM())
print("APrioriRAM took {} seconds".format(time.time() - start_time))
{1: [{'0'}, {'0.1'}, {'0.2'}, {'0.3'}, {'0.4'}, {'0.5'}, {'0.7'}, {'0.9'}, {'1.1'}, {'0.11'}, {'0.13'}, {'0.14'}, {'0.15'}, {'0.16'}, {'0.17'}, {'0.18'}, {'0.20'}, {'0.21'}, {'0.22'}, {'0.25'}, {'0.26'}, {'0.27'}, {'0.29'}, {'0.30'}, {'0.31'}, {'0.33'}, {'0.34'}, {'0.35'}, {'0.38'}, {'0.39'}, {'0.40'}, {'0.41'}, {'0.42'}, {'0.43'}, {'0.44'}, {'0.45'}, {'0.46'}, {'0.47'}], 2: [frozenset({'0.25', '0.26'}), frozenset({'0.17', '0.33'})]}
APrioriRAM took 9.511401653289795 seconds

It is better! This version uses Python sets, and set operations are much more performant in Python (https://stackoverflow.com/questions/27674289/the-complextiy-of-python-issubset) than what I did previously. Checking whether item names belong to another set is a hash-table lookup (O(1), barring collisions), instead of the far more expensive label lookup on a pandas Series that I did previously with this line:

if not row[elt]
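
A quick micro-benchmark sketch illustrates the gap. The iteration count is arbitrary, and it assumes df and the itemset from the subset example above are in scope:

itemset = {'0.7', '0.35'}
pandas_row = next(df.iterrows())[1]
set_row = frozenset(h for h, v in pandas_row.items() if v == 1)

start = time.time()
for _ in range(10000):
    all(pandas_row[item] for item in itemset)  # label lookups on a pandas Series
print("Series lookups took {:.3f}s".format(time.time() - start))

start = time.time()
for _ in range(10000):
    itemset <= set_row  # issubset: plain hash-table membership tests
print("issubset took {:.3f}s".format(time.time() - start))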
In [11]:
# we redefine apriori_gen in order to be closer to the paper
# this function needs to keep the order of elements in the sets!
# we could use an ordered set, but Python does not provide one by default,
# so we just convert the itemsets to sorted lists
def apriori_gen(itemsets):
    sorted_itemsets = []
    for itemset in itemsets:
        sorted_itemsets.append(sorted(itemset))

    C_k = set()

    # join step: merge two (k-1)-itemsets that share their first k-2 items
    # (joining with itemset1[-1] < itemset2[-1], as in the paper, avoids
    # building every pair twice)
    for itemset1 in sorted_itemsets:
        for itemset2 in sorted_itemsets:
            if itemset2[:-1] == itemset1[:-1] and itemset1[-1] < itemset2[-1]:
                C_k.add(frozenset(itemset1 + [itemset2[-1]]))

    # prune step: drop candidates that have an infrequent (k-1)-subset
    elt_to_remove = set()

    for x_itemset in C_k:
        for elt in x_itemset:
            sub = set(x_itemset)
            sub.remove(elt)
            if sub not in itemsets:
                elt_to_remove.add(x_itemset)

    for remove in elt_to_remove:
        C_k.remove(remove)

    return C_k
                
apriori_gen([{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'}, {'1', '3', '5'}, {'2', '3', '4'}])
Out[11]:
{frozenset({'1', '2', '3', '4'})}
In [12]:
# Now we check if this new approach is better
start_time = time.time()
print(aPrioriRAM())
print("APrioriRAM took {} seconds".format(time.time() - start_time))
{1: [{'0'}, {'0.1'}, {'0.2'}, {'0.3'}, {'0.4'}, {'0.5'}, {'0.7'}, {'0.9'}, {'1.1'}, {'0.11'}, {'0.13'}, {'0.14'}, {'0.15'}, {'0.16'}, {'0.17'}, {'0.18'}, {'0.20'}, {'0.21'}, {'0.22'}, {'0.25'}, {'0.26'}, {'0.27'}, {'0.29'}, {'0.30'}, {'0.31'}, {'0.33'}, {'0.34'}, {'0.35'}, {'0.38'}, {'0.39'}, {'0.40'}, {'0.41'}, {'0.42'}, {'0.43'}, {'0.44'}, {'0.45'}, {'0.46'}, {'0.47'}], 2: [frozenset({'0.25', '0.26'}), frozenset({'0.17', '0.33'})]}
APrioriRAM took 9.233964681625366 seconds

As you can see, my two implementations of apriori_gen produce the same results. Theoretically, the second should be better because it generates fewer candidates in the join step. In practice, I don't have database optimizations to run an efficient SQL query as in the paper, so my implementation is not optimal (the self-join is O(n^2) ...). A possible way around that is sketched below.
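
Here is a sketch of how that quadratic self-join could be avoided in pure Python, by bucketing the sorted itemsets on their shared (k-1)-prefix so the join only happens inside each bucket. apriori_gen_prefix is my own name, and the prune step is deliberately omitted, so it would still have to run on the result:

from collections import defaultdict
from itertools import combinations

def apriori_gen_prefix(itemsets):
    buckets = defaultdict(list)
    for itemset in itemsets:
        ordered = sorted(itemset)
        buckets[tuple(ordered[:-1])].append(ordered[-1])  # group by common prefix

    C_k = set()
    for prefix, lasts in buckets.items():
        for a, b in combinations(sorted(lasts), 2):       # join only inside a bucket
            C_k.add(frozenset(prefix + (a, b)))
    return C_k

print(apriori_gen_prefix([{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'},
                          {'1', '3', '5'}, {'2', '3', '4'}]))
# without the prune step this also yields {'1', '3', '4', '5'},
# which the subset check would then eliminate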

Discovering rules

Now that we have mined the frequent itemsets, we can extract association rules. A rule X => Y, built from a frequent itemset l = X ∪ Y, is kept when its confidence, support(l) / support(X), exceeds MIN_CONF.

In [13]:
row_list = []
    
for index, row in df.iterrows():
    elements = []      

    for header, value in row.iteritems():
        if value == 1:
            elements.append(header)

    row_list.append(frozenset(elements))

def support(_itemset):
    count = 0
    for row in row_list:     
        if _itemset.issubset(row):
            count = count + 1     
            
    return count

a = set()
a.add('0.1')
print(support(a))    
        

def gen_rules(l_k, a_m, rules):
    subsets = set()

    # all (m-1)-subsets of a_m, as in the paper's genrules procedure
    local_subsets = list(combinations(a_m, len(a_m)-1))
    for subset in local_subsets:
        subsets.add(frozenset(subset))

    for subset in subsets:
        try:
            conf = support(l_k)/support(subset)
        except ZeroDivisionError:
            conf = 0

        if conf > MIN_CONF:
            rules.append([subset, l_k.difference(subset)])
            if len(subset) > 1:
                # recurse on this (m-1)-subset to try shorter antecedents
                gen_rules(l_k, subset, rules)
                
rules = []
gen_rules({'0.25', '0.26'}, {'0.25', '0.26'}, rules)
print(rules)
6271
[[frozenset({'0.26'}), {'0.25'}], [frozenset({'0.25'}), {'0.26'}]]

So now we can assemble our final code:

In [14]:
MIN_SUPP = 0.03
MIN_CONF = 0.7

row_list = []

# each row becomes a frozenset of the items it contains
for index, row in df.iterrows():
    elements = []

    for header, value in row.items():
        if value == 1:
            elements.append(header)

    row_list.append(frozenset(elements))
    
    
def is_frequent(itemset, min_supp):
    """Scan the whole DataFrame and check whether itemset reaches min_supp."""
    count = 0
    for _, row in df.iterrows():
        # a row supports the itemset only if every one of its items is present
        if all(row[item] == 1 for item in itemset):
            count += 1
    frequency = count/df["0"].count()  # df["0"].count() == number of transactions

    return frequency >= min_supp

def itemset_1_frequent(df, min_supp):
    sum_column = df.sum()
    result = []
    
    for index, count in sum_column.items():
        if count/df["0"].count() > min_supp:
            result.append(set([index]))
            
    return result

def apriori_gen(itemsets):
    sorted_itemsets = []
    for itemset in itemsets:
        sorted_itemsets.append(sorted(itemset))

    C_k = set()

    # join step: merge two (k-1)-itemsets that share their first k-2 items
    for itemset1 in sorted_itemsets:
        for itemset2 in sorted_itemsets:
            if itemset2[:-1] == itemset1[:-1] and itemset1[-1] < itemset2[-1]:
                C_k.add(frozenset(itemset1 + [itemset2[-1]]))

    # prune step: drop candidates that have an infrequent (k-1)-subset
    elt_to_remove = set()

    for x_itemset in C_k:
        for elt in x_itemset:
            sub = set(x_itemset)
            sub.remove(elt)
            if sub not in itemsets:
                elt_to_remove.add(x_itemset)

    for remove in elt_to_remove:
        C_k.remove(remove)

    return C_k

def support(_itemset):
    count = 0
    for row in row_list:     
        if _itemset.issubset(row):
            count = count + 1     
            
    return count
        

def gen_rules(l_k, a_m, rules):
    subsets = set()

    # all (m-1)-subsets of a_m, as in the paper's genrules procedure
    local_subsets = list(combinations(a_m, len(a_m)-1))
    for subset in local_subsets:
        subsets.add(frozenset(subset))

    for subset in subsets:
        try:
            conf = support(l_k)/support(subset)
        except ZeroDivisionError:
            conf = 0

        if conf > MIN_CONF:
            rules.append([subset, l_k.difference(subset)])
            if len(subset) > 1:
                # recurse on this (m-1)-subset to try shorter antecedents
                gen_rules(l_k, subset, rules)
                
def aPrioriRAM():
    itemsets = {} # dict of lists of sets
    itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
    last_itemset = itemsets[1]
    itemset_size = 1

    row_list = []

    # each row becomes a frozenset of the items it contains
    for index, row in df.iterrows():
        elements = []

        for header, value in row.items():
            if value == 1:
                elements.append(header)

        row_list.append(frozenset(elements))

    while last_itemset:
        itemset_size += 1

        candidates = apriori_gen(last_itemset)
        for candidate in candidates:
            count = 0

            for row in row_list:
                if candidate.issubset(row):
                    count += 1

            if count/df["0"].count() > MIN_SUPP:
                itemsets.setdefault(itemset_size, []).append(candidate)

        last_itemset = itemsets.get(itemset_size, [])
    
    rules = []
    for k, itemset in itemsets.items():
        if k >= 2:
            for itemset_local in itemset:
                gen_rules(itemset_local, itemset_local, rules)
        
    return rules

start_time = time.time()
rules = aPrioriRAM()

print("Rules:")

for rule in rules:
    print("{} => {}".format(rule[0], rule[1]))

print("APrioriRAM took {} seconds".format(time.time() - start_time))
Rules:
frozenset({'0.7', '1.1'}) => frozenset({'0.35'})
frozenset({'0.7', '0.35'}) => frozenset({'1.1'})
frozenset({'1.1', '0.35'}) => frozenset({'0.7'})
frozenset({'0.3', '0.17'}) => frozenset({'0.33'})
frozenset({'0.3', '0.33'}) => frozenset({'0.17'})
frozenset({'0.17', '0.33'}) => frozenset({'0.3'})
frozenset({'0.30', '0.43'}) => frozenset({'0.15'})
frozenset({'0.15', '0.30'}) => frozenset({'0.43'})
frozenset({'0.15', '0.43'}) => frozenset({'0.30'})
frozenset({'0.44', '0'}) => frozenset({'0.2'})
frozenset({'0.2', '0'}) => frozenset({'0.44'})
frozenset({'0.2', '0.44'}) => frozenset({'0'})
APrioriRAM took 13.434985160827637 seconds
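
As a quick sanity check, we can recompute the confidence of the first mined rule directly with the support function defined above (rules comes from the run we just did):

antecedent, consequent = rules[0]
conf = support(antecedent | consequent) / support(antecedent)
print("{} => {} holds with confidence {:.2f}".format(set(antecedent), set(consequent), conf))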
