从概念到实例,手把手教你玩转数据挖掘
大数据DT
共 5667字,需浏览 12分钟
·
2021-06-23 22:39
导读:本文带你了解什么是数据挖掘,并利用Python进行商品的亲和性分析。
创建数据集。数据集能直接反映一些真实事件; 选择算法。选择一个合适的算法才能更好地对数据进行处理; 优化算法。每种数据挖掘算法都有参数,它们或是算法自身包含的,或是使用者添加的,这些参数会影响算法的具体决策。
defaultdict(int):初始化为 0 defaultdict(float):初始化为 0.0 defaultdict(str):初始化为 ''
import numpy as np
from collections import defaultdict
# Text file read by np.loadtxt; presumably one row per transaction and one
# 0/1 column per product -- TODO confirm against the data file.
dataset_filename = "affinity_dataset.txt"

# Product names; later code looks a name up by column index.
features = [
    "bread",
    "milk",
    "cheese",
    "apple",
    "banana",
]

X = np.loadtxt(dataset_filename)

# Show the first five rows of the purchase data.
print(X[:5])
# Number of rows whose column 3 (the "apple" entry of `features`) equals 1.
num_apple_purchases = sum(1 for sample in X if sample[3] == 1)
print("{0} people bought Apples ".format(num_apple_purchases))
# Number of rows whose column 4 (the "banana" entry of `features`) equals 1.
num_banana_purchases = sum(1 for sample in X if sample[4] == 1)
print("{0} people bought banana".format(num_banana_purchases))
# Tallies for every candidate rule "premise -> conclusion". Rules are keyed
# by a (premise_index, conclusion_index) tuple of column numbers.
valid_rules = defaultdict(int)     # premise present and conclusion == 1
invalid_rules = defaultdict(int)   # premise present but conclusion != 1
num_occurances = defaultdict(int)  # number of samples containing each premise

for sample in X:
    # Enumerate every column of the sample instead of a hard-coded range(4):
    # the fixed range skipped the last column (index 4), so that product
    # could never appear as a premise or a conclusion.
    for premise, premise_value in enumerate(sample):
        if premise_value == 0:
            # Premise not purchased in this sample; no rule can be evaluated.
            continue
        # One more sample in which the premise item was purchased.
        num_occurances[premise] += 1
        for conclusion, conclusion_value in enumerate(sample):
            if premise == conclusion:
                # A rule from an item to itself carries no information.
                continue
            if conclusion_value == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1
# Support of a rule is the raw number of samples in which it held, so the
# valid-rule tally is reused directly (same object, not a copy).
support = valid_rules

# Confidence of a rule: the number of times it held divided by the number of
# samples in which its premise occurred.
confidence = defaultdict(float)
for rule, times_valid in valid_rules.items():
    premise, conclusion = rule
    confidence[rule] = times_valid / num_occurances[premise]
def print_rule(premise, conclusion, support, confidence, features):
    """Print one association rule with its support and confidence.

    Args:
        premise: Index into ``features`` of the item in the "if" part.
        conclusion: Index into ``features`` of the item in the "then" part.
        support: Mapping from ``(premise, conclusion)`` to the rule's support.
        confidence: Mapping from ``(premise, conclusion)`` to the rule's
            confidence.
        features: Sequence of item names, indexed by ``premise`` and
            ``conclusion``.

    A rule missing from ``support`` or ``confidence`` is reported as 0.

    Raises:
        IndexError: If ``premise`` or ``conclusion`` is not a valid index
            into ``features``.
    """
    rule = (premise, conclusion)
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule:if a person buys {0} they will also buy {1} ".format(premise_name, conclusion_name))
    # Use .get() rather than indexing: indexing a defaultdict inserts the
    # missing key, which would silently add a zero-count rule to the caller's
    # tables, and indexing a plain dict would raise KeyError.
    print(" - Support : {0}".format(support.get(rule, 0)))
    print(" - Confidence : {0:.3f}".format(confidence.get(rule, 0.0)))
# Example: print the rule whose premise is column 1 ("milk") and whose
# conclusion is column 3 ("apple").
features = ["bread", "milk", "cheese", "apple", "banana"]
premise, conclusion = 1, 3
print_rule(premise, conclusion, support, confidence, features)
from operator import itemgetter

# Rules ordered from highest to lowest support; each entry is a
# ((premise, conclusion), support_value) pair.
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

# Print the five best-supported rules. Slicing instead of indexing 0..4 keeps
# this from raising IndexError when fewer than five rules were found.
for index, entry in enumerate(sorted_support[:5]):
    print("Rule #{0}".format(index + 1))
    premise, conclusion = entry[0]
    print_rule(premise, conclusion, support, confidence, features)
评论