Educoder关联规则挖掘

第一关:数据探索和预处理

本实训中,实验内容为完成数据探索和预处理,根据提示,在右侧编辑器补充代码,完成如下四个任务:

  1. 使用 pandas库的 read_excel方法读入实验数据集;
  2. 使用 info()函数, 观察数据属性类型并判断是否符合算法要求;
  3. 若不符合要求,请选择合适的策略对数据进行离散化处理,即将六种类型数据分别标记为’ABCDEF’,再使用等频离散进行离散化。
import pandas as pd
def Task():
    """
    Stage 1: data exploration and preprocessing for the TCM data set.

    Reads the Excel data, inspects its dtypes, maps the six feature columns
    to the letters 'ABCDEF', equal-frequency discretizes each feature into 4
    bins, and writes the discretized data to data/apriori.txt.

    :return: (answer_1, answer_2, answer_3, answer_4) — first 5 raw rows,
        result of data.info(), the feature column names, first 5 discretized rows.
    """
    # Read the TCM data set with pandas' read_excel.
    #*************** BEGIN ******************
    data = pd.read_excel('data/data.xls')
    answer_1 = data.head(5)
    #**************** END *******************
    #*************** BEGIN ******************
    # Inspect the column dtypes to see whether they suit the algorithm.
    # NOTE(review): DataFrame.info() prints to stdout and returns None, so
    # answer_2 is None — confirm this matches the grader's expectation.
    info = data.info()
    answer_2 = info
    #**************** END *******************
    #*************** BEGIN ******************
    # Discretization labels: map the six feature columns (all but the final
    # label column) to the letters 'ABCDEF'.
    typelabel = dict(zip(data.columns[:-1], 'ABCDEF'))
    #**************** END *******************
    keys = list(typelabel.keys())
    answer_3 = keys
    # Equal-frequency discretization into k bins.
    k = 4
    datas = list()
    #*************** BEGIN ******************
    for j in keys:
        # Bin labels, e.g. 'A1'..'A4' for the column mapped to 'A'.
        label = [typelabel[j]+str(i+1) for i in range(k)]
        # qcut assigns each value to one of k equal-frequency bins.
        d = pd.qcut(data[j], k, labels=label)
        datas.append(d)
    #**************** END *******************
    # Keep the TNM-stage label column as-is.
    datas.append(data['TNM分期'])
    # The data set after discretization (transpose back to rows x columns).
    datas = pd.DataFrame(datas).T
    answer_4 = datas.head(5)
    # Store the discretized data set in the apriori.txt file.
    filepath = 'data/apriori.txt'
    datas.to_csv(filepath, header=0, index=0, sep=',')
    return answer_1, answer_2, answer_3, answer_4

第二关:FP增长树算法调用

根据提示,在右侧编辑器补充代码,调用FP增长树算法,实现寻找频繁项集。其中, minimum_support设置为 56, include_support设置为 True。


(1)FP-growth算法将数据存储在一种称为FP树的紧凑数据结构中。由于FP树比其他树更加复杂,因此需要一个类来保存树的每一个节点。首先,定义FP树的节点类。

class FPNode(object):
    """A node in an FP tree."""

    def __init__(self, tree, item, count=1):
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None
        self._children = {}
        self._neighbor = None

    def add(self, child):
        """Add the given FPNode as a child of this node."""
        if not isinstance(child, FPNode):
            raise TypeError("Can only add other FPNodes as children")

        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """
        Check whether this node contains a child node for the given item.
        If so, return that node; otherwise, return None.
        """
        try:
            return self._children[item]
        except KeyError:
            return None

    def __contains__(self, item):
        return item in self._children

    @property
    def tree(self):
        """The tree in which this node appears."""
        return self._tree

    @property
    def item(self):
        """The item contained in this node."""
        return self._item

    @property
    def count(self):
        """The count associated with this node's item."""
        return self._count

    def increment(self):
        """Increment the count associated with this node's item."""
        if self._count is None:
            raise ValueError("Root nodes have no associated count.")
        self._count += 1

    @property
    def root(self):
        """True if this node is the root of a tree; false otherwise."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """True if this node is a leaf in the tree; false otherwise."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The node's parent node."""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a parent.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a parent from another tree.")
        self._parent = value

    @property
    def neighbor(self):
        """
        The node's neighbor: the node holding the same item to the
        "right" of it in the tree.
        """
        return self._neighbor

    @neighbor.setter
    def neighbor(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a neighbor.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a neighbor from another tree.")
        self._neighbor = value

    @property
    def children(self):
        """The nodes that are children of this node."""
        # BUG FIX: dict.itervalues() is Python 2 only; Python 3 uses values().
        return tuple(self._children.values())

    def inspect(self, depth=0):
        # BUG FIX: the original Python-2-style statement added repr(self) to
        # print's return value (None), raising TypeError on Python 3.
        print('  ' * depth + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

    def __repr__(self):
        if self.root:
            return "<%s (root)>" % type(self).__name__
        return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)

（2）定义 FPTree 类来构建FP树

from collections import defaultdict, namedtuple
class FPTree(object):
    """
    An FP tree.  Every node carries an item and a count, and all nodes that
    hold the same item are chained together as "neighbors" along a route.
    """

    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        # The root node of the tree.
        self._root = FPNode(self, None, None)

        # A dict mapping items to the head and tail of a path of
        # "neighbors" that will hit every node containing that item.
        self._routes = {}

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Add a transaction to the tree."""
        point = self._root

        for item in transaction:
            next_point = point.search(item)
            if next_point:
                next_point.increment()
            else:
                next_point = FPNode(self, item)
                point.add(next_point)

                self._update_route(next_point)

            point = next_point

    def _update_route(self, point):
        """Append the given node to the route for its item."""
        assert self is point.tree

        try:
            route = self._routes[point.item]
            route[1].neighbor = point  # route[1] is the tail of the route
            self._routes[point.item] = self.Route(route[0], point)
        except KeyError:
            # Start a new route.
            self._routes[point.item] = self.Route(point, point)

    def items(self):
        """
        Generate one 2-tuple for each item represented in the tree.  The
        first element is the item itself; the second is a generator that
        yields the nodes in the tree that belong to the item.

        NOTE: the original docstrings here were dedented to column 0 by the
        scrape, which is an IndentationError — re-indented to be valid.
        """
        for item in self._routes.keys():
            yield (item, self.nodes(item))

    def nodes(self, item):
        """
        Generate the sequence of nodes that contain the given item.
        """
        try:
            node = self._routes[item][0]
        except KeyError:
            return

        while node:
            yield node
            node = node.neighbor

    def prefix_paths(self, item):
        """Generate the prefix paths that end with the given item."""

        def collect_path(node):
            path = []
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        print('Tree:')
        self.root.inspect(1)

        print()
        print('Routes:')
        for item, nodes in self.items():
            print('  %r' % item)
            for node in nodes:
                print('    %r' % node)

（3）定义conditional_tree_from_paths(),从给定的前缀路径构建条件FP树。前缀路径是介于所查找元素项与树根节点之间的所有内容。

def conditional_tree_from_paths(paths):
    """Build a conditional FP tree from the given prefix paths."""
    cond_tree = FPTree()
    cond_item = None
    seen_items = set()

    # Import the nodes of each path into the new tree.  Only the counts on
    # the leaf nodes matter at this point; every interior count is
    # reconstructed afterwards from the leaves below it.
    for path in paths:
        if cond_item is None:
            cond_item = path[-1].item

        cursor = cond_tree.root
        for node in path:
            child = cursor.search(node.item)
            if child is None:
                # First time this item appears on this branch: graft a fresh
                # node onto the tree.
                seen_items.add(node.item)
                initial = node.count if node.item == cond_item else 0
                child = FPNode(cond_tree, node.item, initial)
                cursor.add(child)
                cond_tree._update_route(child)
            cursor = child

    assert cond_item is not None

    # Propagate the leaf counts upward to fill in the non-leaf counts.
    for path in cond_tree.prefix_paths(cond_item):
        leaf_count = path[-1].count
        for node in reversed(path[:-1]):
            node._count += leaf_count

    return cond_tree

（4）从构建的条件FP树中寻找频繁项集。

在给定的事务中使用FP-growth查找频繁项集。该函数返回一个生成器,而不是快速填充的项列表。"事务"参数可以是项的任何可迭代项。"minimum_support"应该是一个整数,它指定要接受的项集出现的最小数量。如果include_support为true,则yield(itemset, support)对,而不仅仅是项集。

def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth.

    Returns a generator rather than an eagerly-filled list of itemsets.

    :param transactions: any iterable of transactions (each an iterable of items).
    :param minimum_support: integer — the minimum number of occurrences an
        itemset must have to be accepted.
    :param include_support: if True, yield (itemset, support) pairs instead
        of just the itemsets.
    """
    # BUG FIX: the transactions are traversed twice (once to count item
    # supports, once to build the tree), so a generator argument would be
    # exhausted after the first pass.  Materialize it once up front.
    transactions = list(transactions)

    items = defaultdict(lambda: 0)  # mapping from items to their supports

    # Load the passed-in transactions and count the support of individual items.
    for transaction in transactions:
        for item in transaction:
            items[item] += 1

    # Remove infrequent items from the item support dict.
    items = dict((item, supports) for item, supports in items.items()
        if supports >= minimum_support)

    # Build the FP-tree.  Before a transaction can be added to the tree it
    # must be stripped of infrequent items, and the surviving items must be
    # sorted in decreasing order of frequency.
    def clean_transaction(transaction):
        transaction = filter(lambda v: v in items, transaction)
        transaction_list = list(transaction)  # temp list so filter isn't consumed twice
        transaction_list.sort(key=lambda v: items[v], reverse=True)
        return transaction_list

    master = FPTree()
    for transaction in map(clean_transaction, transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        # Recursively mine the (conditional) tree for frequent itemsets.
        for item, nodes in tree.items():
            supports = sum(nn.count for nn in nodes)
            if supports >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, supports) if include_support else found_set

                # Build a conditional tree and recursively search inside it.
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
                for s in find_with_suffix(cond_tree, found_set):
                    yield s  # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset

调用FP增长树算法,实现寻找频繁项集

def load_data(filename):
    """Read a comma-separated UTF-8 text file into a list of string lists."""
    with open(filename, 'r', encoding='utf-8') as fh:
        return [line.strip().split(",") for line in fh]

# Load the discretized transactions produced in stage 1.
dataset = load_data('data/apriori.txt')  # read the data

#*************** BEGIN ******************
# Mine frequent itemsets: minimum_support=56 occurrences; yield
# (itemset, support) pairs because include_support=True.
frequent_itemsets = find_frequent_itemsets(transactions=dataset, minimum_support=56, include_support=True)
#**************** END *******************

print(type(frequent_itemsets))  # print type

result = []
for itemset, supports in frequent_itemsets:    # drain the generator into a list
    result.append((itemset, supports))

result = sorted(result, key=lambda i: i[0])   # sort, then print
for itemset, supports in result:
    # Print each itemset with its relative support (count / #transactions).
    print(str(itemset) + ' ' + str(float(supports/len(dataset))))

print(len(result))  # print the number of frequent itemsets

第三关:Apriori算法

本关任务:根据Apriori算法步骤,编写代码实现寻找频繁项集。

算法步骤

Educoder关联规则挖掘

对比,FP算法和Apriori算法,其寻找的频繁项集的结果是相同的,但是FP算法的运行时间明显小于Apriori算法的运行时间。这是因为apriori算法需要多次扫描数据集,每次利用候选频繁集产生频繁集;而FP-growth则利用树形结构,无需产生候选频繁集而是直接得到频繁集,大大减少扫描数据集的次数,从而提高了算法的效率。

本关任务

根据下面的文字提示,在右侧编辑器补充代码,在已有的代码框架下实现函数功能,完成实验。

根据Apriori算法步骤,编写代码实现寻找频繁项集
（1）首先,定义create_C1函数,通过扫描数据集创建大小为1的所有候选项集的集合
def create_C1(data_set):
    """Scan the data set and build the set of all candidate 1-itemsets."""
    # Each item is wrapped in an immutable frozenset so it can live inside a
    # set and later serve as a dict key.
    return {frozenset([item]) for transaction in data_set for item in transaction}
（2）定义is_apriori函数来判断候选K项集是否满足先验原理。其中,Ck_item是Ck中的一个候选k项集;Lksub1是包含全部的频繁(k-1)项集的集合。
如果不满足先验原理(即一个候选k项集Ck_item的(k-1)项子集不在Lksub1中),则返回False,如果满足则返回True。
def is_apriori(Ck_item, Lksub1):
    """
    Check the apriori property for a candidate k-itemset.

    :param Ck_item: a candidate k-itemset (frozenset) from Ck.
    :param Lksub1: the set of all frequent (k-1)-itemsets.
    :return: True when every (k-1)-subset of Ck_item is in Lksub1,
        False otherwise.
    """
    return all(Ck_item - frozenset([member]) in Lksub1 for member in Ck_item)
（3）定义create_Ck函数,由频繁(k-1)项集的集合Lksub1的自身连接产生候选k项集Ck,即 apriori_gen(Fk−1)。
其中,对于不满足先验原理的Ck_item,进行剪枝,得到最终的候选K项集。Lksub1包含频繁(k-1)项集的集合,k是频繁项集的项数。
def create_Ck(Lksub1, k):
    """
    Generate the candidate k-itemsets Ck by self-joining the frequent
    (k-1)-itemsets, pruning candidates that violate the apriori property.

    :param Lksub1: set of frozensets — the frequent (k-1)-itemsets.
    :param k: size of the candidate itemsets to generate.
    :return: set of frozensets — the candidate k-itemsets.
    """
    Ck = set()
    len_Lksub1 = len(Lksub1)
    list_Lksub1 = list(Lksub1)
    # PERF FIX: iterate over unordered pairs (i < j) only.  The original
    # `for j in range(1, len_Lksub1)` visited both (i, j) and (j, i) and even
    # i == j, recomputing the same unions; the resulting set is identical.
    for i in range(len_Lksub1):
        for j in range(i + 1, len_Lksub1):
            l1 = sorted(list_Lksub1[i])
            l2 = sorted(list_Lksub1[j])
            # Two (k-1)-itemsets are joinable only when their first k-2
            # sorted items agree; their union is then exactly a k-itemset.
            if l1[0:k - 2] == l2[0:k - 2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                # Prune candidates that violate the apriori property.
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck
(4) 定义generate_Lk_by_Ck函数,通过从Ck执行删除策略来生成Lk,即对Ck中的每个项进行计数,然后删除不满足最小支持度的项,从而获得频繁k项集。Lk:包含所有频繁k项集的集合;Ck:包含所有候选k项集的集合;min_support:最小支持度;support_data是个字典,用来记录每个频繁项集的支持度。(算法中计算支持度计数的部分)
def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """
    Filter the candidate k-itemsets Ck down to the frequent k-itemsets Lk.

    Counts each candidate's occurrences across data_set, keeps those whose
    relative support reaches min_support, and records each survivor's
    support in support_data (mutated in place).

    :param data_set: list of transactions (each an iterable of items).
    :param Ck: set of candidate k-itemsets (frozensets).
    :param min_support: minimum relative support threshold.
    :param support_data: dict to be filled with itemset -> support.
    :return: the set of frequent k-itemsets.
    """
    item_count = {}
    # Support counting: check every candidate against every transaction.
    for transaction in data_set:
        for candidate in Ck:
            # issubset: True iff all of candidate's items occur in the transaction.
            if candidate.issubset(transaction):
                item_count[candidate] = item_count.get(candidate, 0) + 1

    total = float(len(data_set))
    Lk = set()
    for candidate, count in item_count.items():
        support = count / total
        if support >= min_support:
            Lk.add(candidate)
            # Record the support of each frequent itemset.
            support_data[candidate] = support
    return Lk
（5）定义generate_L获取频繁项集。参数k是所有频繁项集的最大项数,min_support是最小支持度,data_set是要挖掘的数据集(整合前面几个函数,得到频繁项集)
def generate_L(data_set, k, min_support):
    """
    Mine all frequent itemsets of size 1..k from data_set.

    :param data_set: list of transactions to mine.
    :param k: maximum itemset size to look for.
    :param min_support: minimum relative support threshold.
    :return: (L, support_data) — L is a list whose i-th entry is the set of
        frequent (i+1)-itemsets; support_data maps each frequent itemset to
        its support.
    """
    support_data = {}
    # Level 1: candidate 1-itemsets straight from the raw items.
    current = generate_Lk_by_Ck(data_set, create_C1(data_set), min_support,
                                support_data)
    L = [current.copy()]
    # Levels 2..k: self-join the previous level, then filter by support.
    for size in range(2, k + 1):
        candidates = create_Ck(L[-1], size)
        current = generate_Lk_by_Ck(data_set, candidates, min_support,
                                    support_data)
        L.append(current.copy())
    return L, support_data
调用编写的函数,寻找本实验数据集中的频繁项集
def load_data(filename):
    """Load comma-separated transactions from a UTF-8 text file."""
    rows = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for raw in handle.readlines():
            rows.append(raw.strip().split(","))
    return rows
import fpGrowth as fp
def Task():
    """
    Stage 3: run Apriori on the discretized data set and cross-check the
    frequent itemsets against the FP-growth results.

    :return: (len_number, q, p) — the total number of frequent itemsets found
        by Apriori, the set of frequent itemsets from FP-growth, and the set
        from Apriori.
    """
    data = load_data('data/apriori.txt')
    #**************** BEGIN *****************
    L, support_data = generate_L(data, k=3, min_support=0.06)
    #**************** END *******************
    # BUG FIX: the accumulator was named `sum`, shadowing the builtin.
    total = 0
    for Lk in L:  # Lk is the set of frequent k-itemsets
        total += len(Lk)
    len_number = total  # total number of frequent itemsets
    # Validate the result by comparing FP-growth's output with Apriori's.
    # From the earlier experiment both algorithms find the same frequent
    # itemsets (209 of them, maximum itemset size 3).
    result = fp.getFrequentItemsets()
    q = set(frozenset(itemset) for itemset, supports in result)  # FP-growth itemsets
    p = set(freqlist for Lk in L for freqlist in Lk)  # Apriori itemsets
    return len_number, q, p

第四关:实现关联规则抽取

from apriori import *
编写代码实现关联规则抽取
dataset = load_data('data/apriori.txt')
定义 generate_big_rules函数来获取关联规则
def generate_big_rules(L, support_data, min_conf):
    """
    Extract association rules from the frequent itemsets.

    For every frequent itemset F and every smaller, previously-seen frequent
    itemset S with S a subset of F, the rule (F - S) -> S is emitted when its
    confidence support(F) / support(F - S) reaches min_conf.

    :param L: list of sets of frequent k-itemsets (frozensets), by level.
    :param support_data: dict mapping each frequent itemset to its support.
    :param min_conf: minimum confidence threshold.
    :return: list of (antecedent, consequent, support, confidence) tuples.
    """
    rules = []
    seen = []
    for level in L:
        for freq_set in level:  # e.g. ('B4') or ('B4', 'C4', 'H4')
            for candidate in seen:
                if not candidate.issubset(freq_set):
                    continue
                # Confidence of (freq_set - candidate) -> candidate.
                confidence = support_data[freq_set] / support_data[freq_set - candidate]
                # antecedent, consequent, support, confidence
                rule = (freq_set - candidate, candidate,
                        support_data[freq_set], confidence)
                if confidence >= min_conf and rule not in rules:
                    rules.append(rule)
            seen.append(freq_set)
    return rules
def task():
    """
    Stage 4: mine frequent itemsets (up to size 4, min support 0.06) and
    extract association rules at a 0.75 confidence threshold.
    """
    frequent, supports = generate_L(dataset, k=4, min_support=0.06)
    # Find the association rules among the frequent itemsets.
    return generate_big_rules(frequent, supports, min_conf=0.75)

第五关:参数选择与关联规则结果分析

from apriori import *
编写代码实现关联规则抽取

def task():
    """
    Stage 5: compare rule extraction under two parameter settings —
    (min_support=0.1, min_conf=0.75) versus (min_support=0.06, min_conf=0.5).

    :return: (big_rules_list, big_rules_list_2) — the two rule lists.
    """
    dataset = load_data('data/apriori.txt')
    #**************** BEGIN *****************
    strict_L, strict_support = generate_L(dataset, k=4, min_support=0.1)
    big_rules_list = generate_big_rules(strict_L, strict_support, min_conf=0.75)
    #**************** END *******************

    #**************** BEGIN *****************
    loose_L, loose_support = generate_L(dataset, k=4, min_support=0.06)
    big_rules_list_2 = generate_big_rules(loose_L, loose_support, min_conf=0.5)
    #**************** END *******************

    return big_rules_list, big_rules_list_2

Original: https://blog.csdn.net/m0_56494324/article/details/122056913
Author: 南风不竞呀
Title: Educoder关联规则挖掘

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/698487/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球