关联分析——频繁项集的产生之Apriori算法
频繁项集的产生—Apriori算法
Apriori算法用于从数据集中提取频繁项集,以购物篮事务为例说明其过程:
提取频繁项集的过程如下:
Apriori算法的伪码如下:
; Apriori算法的Python实现
给出数据集:
data = [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
提取1-项集
def createC1(data):
c1 = []
for task in data:
for item in task:
if [item] not in c1:
c1.append([item])
c1.sort()
return list(map(frozenset, c1))
c1 = createC1(data)
c1
[frozenset({1}),
frozenset({2}),
frozenset({3}),
frozenset({4}),
frozenset({5})]
提取频繁k-项集
基于支持度的剪枝原理可用下图表示:
其原理在于如果一个项集是频繁的,则它的所有子集一定也是频繁的。根据逆否命题可得:如果一个项集的某个子集是非频繁的,则该项集一定是非频繁的。
def scanData(data, C_k, minSupport):
ssCnt = {}
for task in data:
for can in C_k:
if can.issubset(task):
ssCnt[can] = ssCnt.get(can, 0) + 1
retlist = []
F_k = {}
for key in ssCnt:
support = ssCnt[key] / len(data)
if support >= minSupport:
retlist.insert(0, key)
F_k[key] = support
return retlist, F_k
F1_l, F_1 = scanData(data, c1, 0.5)
print(F1_l)
[frozenset({5}), frozenset({2}), frozenset({3}), frozenset({1})]
print(F_1)
{frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({2}): 0.75, frozenset({5}): 0.75}
对应伪码的第2步和第12步。
生成候选k-项集
生成候选k-项集采用F(k-1) * F(k-1)方法,其原理如下:
def aprioriGen(F_l, k):
C_l = []
for i in range(len(F_l)-1):
for j in range(i+1, len(F_l)):
l1 = list(F_l[i])[:k-2]
l2 = list(F_l[j])[:k-2]
l1.sort()
l2.sort()
if l1 == l2:
C_l.append(F_l[i] | F_l[j])
return C_l
C2_l = aprioriGen(F1_l, 2)
C2_l
[frozenset({2, 5}),
frozenset({3, 5}),
frozenset({1, 5}),
frozenset({2, 3}),
frozenset({1, 2}),
frozenset({1, 3})]
对应伪码的第5步。
Apriori算法
def apriori(data, minsupport):
C1 = createC1(data)
F1_l, F1 = scanData(data, C1, minsupport)
L = [F1_l]
F = F1
k = 2
while len(L[k-2]) > 0:
Ck_l = aprioriGen(L[k-2], k)
Fk_l, Fk = scanData(data, Ck_l, minsupport)
L.append(Fk_l)
F.update(Fk)
k += 1
return L, F
L, F = apriori(data, 0.2)
print(L)
[[frozenset({5}), frozenset({2}), frozenset({4}), frozenset({3}), frozenset({1})], [frozenset({1, 2}), frozenset({1, 5}), frozenset({2, 3}), frozenset({3, 5}), frozenset({2, 5}), frozenset({1, 3}), frozenset({1, 4}), frozenset({3, 4})], [frozenset({1, 3, 5}), frozenset({1, 2, 3}), frozenset({1, 2, 5}), frozenset({2, 3, 5}), frozenset({1, 3, 4})], [frozenset({1, 2, 3, 5})], []]
print(F)
{frozenset({1}): 0.5, frozenset({3}): 0.75, frozenset({4}): 0.25, frozenset({2}): 0.75, frozenset({5}): 0.75, frozenset({3, 4}): 0.25, frozenset({1, 4}): 0.25, frozenset({1, 3}): 0.5, frozenset({2, 5}): 0.75, frozenset({3, 5}): 0.5, frozenset({2, 3}): 0.5, frozenset({1, 5}): 0.25, frozenset({1, 2}): 0.25, frozenset({1, 3, 4}): 0.25, frozenset({2, 3, 5}): 0.5, frozenset({1, 2, 5}): 0.25, frozenset({1, 2, 3}): 0.25, frozenset({1, 3, 5}): 0.25, frozenset({1, 2, 3, 5}): 0.25}
封装
class Apriori:
def __init__(self, minsupport):
self.minsupport = minsupport
def createC1(self, data):
c1 = []
for task in data:
for item in task:
if [item] not in c1:
c1.append([item])
c1.sort()
return list(map(frozenset, c1))
def scanData(self, data, C_k, minSupport):
ssCnt = {}
for task in data:
for can in C_k:
if can.issubset(task):
ssCnt[can] = ssCnt.get(can, 0) + 1
retlist = []
F_k = {}
for key in ssCnt:
support = ssCnt[key] / len(data)
if support >= minSupport:
retlist.insert(0, key)
F_k[key] = support
return retlist, F_k
def aprioriGen(self, F_l, k):
C_l = []
for i in range(len(F_l)-1):
for j in range(i+1, len(F_l)):
l1 = list(F_l[i])[:k-2]
l2 = list(F_l[j])[:k-2]
l1.sort()
l2.sort()
if l1 == l2:
C_l.append(F_l[i] | F_l[j])
return C_l
def apriori(self, data):
minsupport = self.minsupport
C1 = self.createC1(data)
F1_l, F1 = self.scanData(data, C1, minsupport)
L = [F1_l]
F = F1
k = 2
while len(L[k-2]) > 0:
Ck_l = self.aprioriGen(L[k-2], k)
Fk_l, Fk = self.scanData(data, Ck_l, minsupport)
L.append(Fk_l)
F.update(Fk)
k += 1
return L, F
Original: https://blog.csdn.net/PythonLearner_MJ/article/details/118692693
Author: 写BUG的Jerry
Title: 关联分析——频繁项集的产生之Apriori算法
原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/699972/
转载文章受原作者版权保护。转载请注明原作者出处!