# A Priori implementation

In this notebook, we will implement the algorithm of the Apriori algorithm as described in "Fast algorithms for mining association rules", Rakesh Agrawal, Ramakrishnan Srikant.

First we will try a dummy implementation, then we will try to use correct data-structures.

## Naive approach¶

```
import pandas as pd
import numpy as np
import time
from itertools import combinations
df = pd.read_csv('./data.csv')
df = df.drop('1', 1)
df.head(10)
```

```
MIN_SUPP = 0.05
MIN_CONF = 0.05
```

```
def is_frequent(itemset, min_supp):
count = 0
for _, row in df.iterrows():
is_line_valid = True
for item in itemset:
if row[item] == 0:
is_line_valid = False
if is_line_valid:
count += 1
frequency = count/df["0"].count()
if frequency >= min_supp:
return True
return False
```

```
def itemset_1_frequent(df, min_supp):
sum_column = df.sum()
result = []
for index, count in sum_column.items():
if count/df["0"].count() > min_supp:
result.append(set([index]))
return result
itemsets = {} # dict of lists of sets
itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
print(itemsets)
```

```
def apriori_gen(itemsets):
# simple approach, not the one on the paper
# it works but does not seem as optimized as the paper approach
items = set()
for x_itemset in itemsets:
for item in x_itemset:
items.add(item)
C_k = set()
for x_itemset in itemsets:
for item in items:
if item not in x_itemset:
C_k.add(frozenset([*x_itemset, item]))
elt_to_remove = set()
for x_itemset in C_k:
for elt in x_itemset:
subset = set([*x_itemset])
subset.remove(elt)
if subset not in itemsets:
elt_to_remove.add(x_itemset)
for remove in elt_to_remove:
C_k.remove(remove)
return C_k
apriori_gen([{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'}, {'1', '3', '5'}, {'2', '3', '4'}])
```

In aPriori, the are three ways of stating that an itemset is infrequent. It could be because:

- it is not generated (in the first part of apriori_gen)
- it is pruned (second part of apriori_gen, based on the fact that if the itemset {beer} is non-frequent, the itemset {beer, milk} is non-frequent too)
- it does not have minimum support (we count in data)

With the approach here, the number of generated itemsets is greater than the one on the paper, leading to a diminution of performances.

```
def subset(candidates, row):
""" Filter all candidates itemset wich are present in the row
"""
# dummy solution
subset = set()
for candidate in candidates:
add = True
for elt in candidate:
if not row[elt]:
add = False
if add:
subset.add(frozenset(candidate))
return subset
candidates = [{'0.7', '0.35'}, {'0', '0.1'}]
row = next(df.iterrows())[1]
print(subset(candidates, row))
```

```
def aPriori():
itemsets = {} # dict of lists of sets
itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
last_itemset = itemsets[1]
itemset_size = 1
while(last_itemset):
itemset_size += 1
candidates = apriori_gen(last_itemset)
candidates_count = {}
for index, row in df.iterrows():
# print(row)
local_candidates = subset(candidates, row)
for local_candidate in local_candidates:
candidates_count[id(local_candidate)] = candidates_count.get(id(local_candidate), 0) + 1
# check minimum support
for candidate in candidates:
if candidates_count[id(candidate)]/df["0"].count() > MIN_SUPP:
itemsets.setdefault(itemset_size, []).append(candidate)
last_itemset = itemsets.get(itemset_size, 0)
return itemsets
print(aPriori())
# let's print the time it takes with this approach
start_time = time.time()
aPriori()
print("APriori took {} seconds".format(time.time() - start_time))
```

In 1994, databases accesses were costly. Today, a dataset of this size fits in RAM. So instead of taking each row, and extracting subsets, we can take each itemset, and count the number of occurences. Theoriticaly complexities are the same.

```
def aPrioriRAM():
itemsets = {} # dict of lists of sets
itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
last_itemset = itemsets[1]
itemset_size = 1
row_list = []
# each row becomes a dict
for index, row in df.iterrows():
elements = []
for header, value in row.iteritems():
if value == 1:
elements.append(header)
row_list.append(frozenset(elements))
while(last_itemset):
itemset_size += 1
candidates = apriori_gen(last_itemset)
for candidate in candidates:
count = 0
for row in row_list:
if candidate.issubset(row):
count = count + 1
if count/df["0"].count() > MIN_SUPP:
itemsets.setdefault(itemset_size, []).append(candidate)
last_itemset = itemsets.get(itemset_size, 0)
return itemsets
start_time = time.time()
print(aPrioriRAM())
print("APrioriRAM took {} seconds".format(time.time() - start_time))
```

It is better ! In fact, with this version we use python set, and we use set operations wich are more performant (https://stackoverflow.com/questions/27674289/the-complextiy-of-python-issubset) in python that what I did previously. Indeed, it is faster to just check if item's names are in another set (O(1), depending if colision on the hashtable), instead of doing it on a panda serie (O(n)) like I did previously with this line:

```
if not row[elt]
```

```
# we redefine apriori_gen in order to be closer to the paper
# this function need to keep the order or element in sets !
# we could try with an orderedSet, but we don't have that by default in python
# we will just convert itemsets to sorted list
def apriori_gen(itemsets):
sorted_itemsets = []
for itemset in itemsets:
sorted_itemsets.append(sorted(list(itemset)))
C_k = set()
for itemset1 in sorted_itemsets:
for itemset2 in sorted_itemsets:
if itemset2[:-1] == itemset1[:-1] and itemset2[-1] != itemset1[-1]:
C_k.add(frozenset(itemset1+ [itemset2[-1]]))
elt_to_remove = set()
for x_itemset in C_k:
for elt in x_itemset:
subset = set([*x_itemset])
subset.remove(elt)
if subset not in itemsets:
elt_to_remove.add(x_itemset)
for remove in elt_to_remove:
C_k.remove(remove)
return C_k
apriori_gen([{'1', '2', '3'}, {'1', '2', '4'}, {'1', '3', '4'}, {'1', '3', '5'}, {'2', '3', '4'}])
```

```
# Now we check if this new approach is better
start_time = time.time()
print(aPrioriRAM())
print("APrioriRAM took {} seconds".format(time.time() - start_time))
```

As you can see my two implementations of apriori_gen are producing similar results. Theoriticaly, the second should be better, because it generates less candidates in the first part. In fact, I don't have databases optimizations to make an efficient SQL query like in the paper, so my implementation is not optimal (self join in O(n^2) ...).

### Discovering rules¶

Now that we have mined frequent itemsets, we can now extract associations rules.

```
row_list = []
for index, row in df.iterrows():
elements = []
for header, value in row.iteritems():
if value == 1:
elements.append(header)
row_list.append(frozenset(elements))
def support(_itemset):
count = 0
for row in row_list:
if _itemset.issubset(row):
count = count + 1
return count
a = set()
a.add('0.1')
print(support(a))
def gen_rules(l_k, a_m, rules):
subsets = set()
local_subsets = list(combinations(a_m, len(a_m)-1))
for subset in local_subsets:
subsets.add(frozenset(subset))
for subset in subsets:
try:
conf = support(l_k)/support(subset)
except ZeroDivisionError:
conf = 0
if conf > MIN_CONF:
rules.append([subset, l_k.difference(subset)])
if len(subset) - 1 > 1:
gen_rules(l_k, subsets)
rules = []
gen_rules({'0.25', '0.26'}, {'0.25', '0.26'}, rules)
print(rules)
```

So now we can have our final code:

```
MIN_SUPP = 0.03
MIN_CONF = 0.7
row_list = []
for index, row in df.iterrows():
elements = []
for header, value in row.iteritems():
if value == 1:
elements.append(header)
row_list.append(frozenset(elements))
def is_frequent(itemset, min_supp):
count = 0
for _, row in df.iterrows():
is_line_valid = True
for item in itemset:
if row[item] == 0:
is_line_valid = False
if is_line_valid:
count += 1
frequency = count/df["0"].count()
if frequency >= min_supp:
return True
return False
def itemset_1_frequent(df, min_supp):
sum_column = df.sum()
result = []
for index, count in sum_column.items():
if count/df["0"].count() > min_supp:
result.append(set([index]))
return result
def apriori_gen(itemsets):
sorted_itemsets = []
for itemset in itemsets:
sorted_itemsets.append(sorted(list(itemset)))
C_k = set()
for itemset1 in sorted_itemsets:
for itemset2 in sorted_itemsets:
if itemset2[:-1] == itemset1[:-1] and itemset2[-1] != itemset1[-1]:
C_k.add(frozenset(itemset1+ [itemset2[-1]]))
elt_to_remove = set()
for x_itemset in C_k:
for elt in x_itemset:
subset = set([*x_itemset])
subset.remove(elt)
if subset not in itemsets:
elt_to_remove.add(x_itemset)
for remove in elt_to_remove:
C_k.remove(remove)
return C_k
def support(_itemset):
count = 0
for row in row_list:
if _itemset.issubset(row):
count = count + 1
return count
def gen_rules(l_k, a_m, rules):
subsets = set()
local_subsets = list(combinations(a_m, len(a_m)-1))
for subset in local_subsets:
subsets.add(frozenset(subset))
for subset in subsets:
try:
conf = support(l_k)/support(subset)
except ZeroDivisionError:
conf = 0
if conf > MIN_CONF:
rules.append([subset, l_k.difference(subset)])
if len(subset) - 1 > 1:
gen_rules(l_k, subsets)
def aPrioriRAM():
itemsets = {} # dict of lists of sets
itemsets[1] = itemset_1_frequent(df, MIN_SUPP)
last_itemset = itemsets[1]
itemset_size = 1
row_list = []
# each row becomes a dict
for index, row in df.iterrows():
elements = []
for header, value in row.iteritems():
if value == 1:
elements.append(header)
row_list.append(frozenset(elements))
while(last_itemset):
itemset_size += 1
candidates = apriori_gen(last_itemset)
for candidate in candidates:
count = 0
for row in row_list:
if candidate.issubset(row):
count = count + 1
if count/df["0"].count() > MIN_SUPP:
itemsets.setdefault(itemset_size, []).append(candidate)
last_itemset = itemsets.get(itemset_size, 0)
rules = []
for k, itemset in itemsets.items():
if k >= 2:
for itemset_local in itemset:
gen_rules(itemset_local, itemset_local, rules)
return rules
start_time = time.time()
rules = aPrioriRAM()
print("Rules:")
for rule in rules:
print("{} => {}".format(rule[0], rule[1]))
print("APrioriRAM took {} seconds".format(time.time() - start_time))
```

## Commentaires