Feature Selection with Stochastic Optimization Algorithms
We will use the make_classification() function to define a dataset with five input variables, two of which are informative, and 1,000 rows. The example below defines the dataset and summarizes its shape.
# define a small classification dataset
from sklearn.datasets import make_classification
# define dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)
# summarize the shape of the dataset
print(X.shape, y.shape)
(1000, 5) (1000,)
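As a quick optional check (an aside that is not part of the original listing), we can confirm that the two classes of this synthetic dataset are roughly balanced using collections.Counter:
# optional check of the class balance (illustrative aside)
from collections import Counter
from sklearn.datasets import make_classification
# define the same small dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)
# count the examples in each class
print(Counter(y))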
We will use a DecisionTreeClassifier as the model, because its performance is quite sensitive to the choice of input variables. We will evaluate the model using good practices, such as repeated stratified k-fold cross-validation with three repeats and 10 folds. The complete example is listed below.
# evaluate a decision tree on the entire small dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)
# define model
model = DecisionTreeClassifier()
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
Mean Accuracy: 0.805 (0.030)
Each candidate subset of features can be represented as a sequence of boolean values, one per column, where True means the feature is used as input and False means it is not. For example, with five input features, the sequence [True, True, True, True, True] would use all input features, while [True, False, False, False, False] would use only the first feature as input. We can enumerate all boolean sequences of length=5 using the product() function. We must specify the allowed values [True, False] and the number of repeats in the sequence, which equals the number of input variables. The function returns an iterable that we can enumerate directly, one sequence at a time.
# determine the number of columns
n_cols = X.shape[1]
best_subset, best_score = None, 0.0
# enumerate all combinations of input features
for subset in product([True, False], repeat=n_cols):
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    # check for no columns (all False)
    if len(ix) == 0:
        continue
    # select columns
    X_new = X[:, ix]
    # define model
    model = DecisionTreeClassifier()
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=cv, n_jobs=-1)
    # summarize scores
    result = mean(scores)
    # check if it is better than the best so far
    if best_score is None or result >= best_score:
        # better result
        best_subset, best_score = ix, result
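Before the complete listing below, a short aside (not part of the original example) makes the enumeration concrete by printing what product() yields for a small number of features:
# small demonstration of product() enumeration (illustrative aside)
from itertools import product
# enumerate all boolean sequences of length 3; prints 2**3 = 8 tuples
for subset in product([True, False], repeat=3):
    print(subset)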
# feature selection by enumerating all possible subsets of features
from itertools import product
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
# define dataset
X, y = make_classification(n_samples=1000, n_features=5, n_informative=2, n_redundant=3, random_state=1)
# determine the number of columns
n_cols = X.shape[1]
best_subset, best_score = None, 0.0
# enumerate all combinations of input features
for subset in product([True, False], repeat=n_cols):
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    # check for no columns (all False)
    if len(ix) == 0:
        continue
    # select columns
    X_new = X[:, ix]
    # define model
    model = DecisionTreeClassifier()
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=cv, n_jobs=-1)
    # summarize scores
    result = mean(scores)
    # report progress
    print('>f(%s) = %f ' % (ix, result))
    # check if it is better than the best so far
    if best_score is None or result >= best_score:
        # better result
        best_subset, best_score = ix, result
# report best
print('Done!')
print('f(%s) = %f' % (best_subset, best_score))
In this case, we can see that the best subset of features involved the features at indexes [0, 3, 4], which achieved a mean classification accuracy of about 83.0 percent. This is better than the result reported previously using all input features.
>f([0, 1, 2, 3, 4]) = 0.813667
>f([0, 1, 2, 3]) = 0.827667
>f([0, 1, 2, 4]) = 0.815333
>f([0, 1, 2]) = 0.824000
>f([0, 1, 3, 4]) = 0.821333
>f([0, 1, 3]) = 0.825667
>f([0, 1, 4]) = 0.807333
>f([0, 1]) = 0.817667
>f([0, 2, 3, 4]) = 0.830333
>f([0, 2, 3]) = 0.819000
>f([0, 2, 4]) = 0.828000
>f([0, 2]) = 0.818333
>f([0, 3, 4]) = 0.830333
>f([0, 3]) = 0.821333
>f([0, 4]) = 0.816000
>f([0]) = 0.639333
>f([1, 2, 3, 4]) = 0.823667
>f([1, 2, 3]) = 0.821667
>f([1, 2, 4]) = 0.823333
>f([1, 2]) = 0.818667
>f([1, 3, 4]) = 0.818000
>f([1, 3]) = 0.820667
>f([1, 4]) = 0.809000
>f([1]) = 0.797000
>f([2, 3, 4]) = 0.827667
>f([2, 3]) = 0.755000
>f([2, 4]) = 0.827000
>f([2]) = 0.516667
>f([3, 4]) = 0.824000
>f([3]) = 0.514333
>f([4]) = 0.777667
Done!
f([0, 3, 4]) = 0.830333
# define a large classification dataset
from sklearn.datasets import make_classification
# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)
# summarize the shape of the dataset
print(X.shape, y.shape)
(10000, 500) (10000,)
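To see why the exhaustive enumeration used above is no longer practical at this scale, a rough calculation (an aside, not from the original) compares the size of the subset search space for 5 and 500 input features:
# size of the feature-subset search space grows as 2**n_features (illustrative aside)
for n_features in (5, 500):
    print('%d features -> %d candidate subsets' % (n_features, 2 ** n_features))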
# evaluate a decision tree on the entire larger dataset
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)
# define model
model = DecisionTreeClassifier()
# define evaluation procedure
cv = StratifiedKFold(n_splits=3)
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report result
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))
Mean Accuracy: 0.913 (0.001)
We first need an objective function that decodes a candidate (a sequence of booleans), evaluates the model on the selected columns, and reports the mean accuracy. The objective() function below implements this, returning both the score and the decoded subset of columns used, which helps with reporting.
# objective function
def objective(X, y, subset):
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    # check for no columns (all False)
    if len(ix) == 0:
        return 0.0, None
    # select columns
    X_new = X[:, ix]
    # define model
    model = DecisionTreeClassifier()
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=3, n_jobs=-1)
    # summarize scores
    result = mean(scores)
    return result, ix
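As a usage sketch (not part of the original listing, and assuming the X and y arrays defined above are in scope), the objective() function can be called directly on a randomly drawn subset:
# illustrative call of the objective() function on a random subset (sketch)
from numpy.random import choice
# draw a random boolean mask over the columns
subset = choice([True, False], size=X.shape[1])
# score the subset and recover the selected column indexes
score, ix = objective(X, y, subset)
print('%d columns selected, estimated accuracy %.3f' % (len(ix), score))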
Each step in the search is taken by flipping the inclusion or exclusion of some columns of the current solution. The mutate() function below implements this, taking a candidate solution (a sequence of booleans) and a mutation hyperparameter, and creating and returning a modified solution (a step in the search space). The larger the p_mutate value (in the range 0 to 1), the larger the step taken in the search space.
# mutation operator
def mutate(solution, p_mutate):
    # make a copy
    child = solution.copy()
    for i in range(len(child)):
        # check for a mutation
        if rand() < p_mutate:
            # flip the inclusion
            child[i] = not child[i]
    return child
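As a quick sanity check (a sketch, not from the original), the expected number of flipped entries per call is roughly p_mutate times the number of features, which is why p_mut is set to 10.0 / 500.0 later on:
# illustrative check of the mutation step size (sketch)
from numpy.random import choice
# a random starting solution over 500 columns
solution = choice([True, False], size=500)
# with p_mutate = 10/500 we expect about 10 positions to flip on average
child = mutate(solution, 10.0 / 500.0)
flips = sum(1 for a, b in zip(solution, child) if a != b)
print('flipped %d of %d positions' % (flips, len(solution)))
The hill climbing procedure below then starts from a random initial solution and repeatedly mutates and evaluates it.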
# generate an initial point
solution = choice([True, False], size=X.shape[1])
# evaluate the initial point
solution_eval, ix = objective(X, y, solution)
# run the hill climb
for i in range(n_iter):
    # take a step
    candidate = mutate(solution, p_mutate)
    # evaluate candidate point
    candidate_eval, ix = objective(X, y, candidate)
    # check if we should keep the new point
    if candidate_eval >= solution_eval:
        # store the new point
        solution, solution_eval = candidate, candidate_eval
    # report progress
    print('>%d f(%s) = %f' % (i+1, len(ix), solution_eval))
The hillclimbing() function below ties this together, taking the dataset, the objective function, and the hyperparameters as arguments, and returning the best subset of dataset columns found and the estimated performance of the model.
# hill climbing local search algorithm
def hillclimbing(X, y, objective, n_iter, p_mutate):
    # generate an initial point
    solution = choice([True, False], size=X.shape[1])
    # evaluate the initial point
    solution_eval, ix = objective(X, y, solution)
    # run the hill climb
    for i in range(n_iter):
        # take a step
        candidate = mutate(solution, p_mutate)
        # evaluate candidate point
        candidate_eval, ix = objective(X, y, candidate)
        # check if we should keep the new point
        if candidate_eval >= solution_eval:
            # store the new point
            solution, solution_eval = candidate, candidate_eval
        # report progress
        print('>%d f(%s) = %f' % (i+1, len(ix), solution_eval))
    return solution, solution_eval
# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)
# define the total iterations
n_iter = 100
# probability of including/excluding a column
p_mut = 10.0 / 500.0
# perform the hill climbing search
subset, score = hillclimbing(X, y, objective, n_iter, p_mut)
# convert into column indexes
ix = [i for i, x in enumerate(subset) if x]
print('Done!')
print('Best: f(%d) = %f' % (len(ix), score))
# stochastic optimization for feature selection
from numpy import mean
from numpy.random import rand
from numpy.random import choice
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
# objective function
def objective(X, y, subset):
    # convert into column indexes
    ix = [i for i, x in enumerate(subset) if x]
    # check for no columns (all False)
    if len(ix) == 0:
        return 0.0, None
    # select columns
    X_new = X[:, ix]
    # define model
    model = DecisionTreeClassifier()
    # evaluate model
    scores = cross_val_score(model, X_new, y, scoring='accuracy', cv=3, n_jobs=-1)
    # summarize scores
    result = mean(scores)
    return result, ix
# mutation operator
def mutate(solution, p_mutate):
    # make a copy
    child = solution.copy()
    for i in range(len(child)):
        # check for a mutation
        if rand() < p_mutate:
            # flip the inclusion
            child[i] = not child[i]
    return child
# hill climbing local search algorithm
def hillclimbing(X, y, objective, n_iter, p_mutate):
    # generate an initial point
    solution = choice([True, False], size=X.shape[1])
    # evaluate the initial point
    solution_eval, ix = objective(X, y, solution)
    # run the hill climb
    for i in range(n_iter):
        # take a step
        candidate = mutate(solution, p_mutate)
        # evaluate candidate point
        candidate_eval, ix = objective(X, y, candidate)
        # check if we should keep the new point
        if candidate_eval >= solution_eval:
            # store the new point
            solution, solution_eval = candidate, candidate_eval
        # report progress
        print('>%d f(%s) = %f' % (i+1, len(ix), solution_eval))
    return solution, solution_eval
# define dataset
X, y = make_classification(n_samples=10000, n_features=500, n_informative=10, n_redundant=490, random_state=1)
# define the total iterations
n_iter = 100
# probability of including/excluding a column
p_mut = 10.0 / 500.0
# perform the hill climbing search
subset, score = hillclimbing(X, y, objective, n_iter, p_mut)
# convert into column indexes
ix = [i for i, x in enumerate(subset) if x]
print('Done!')
print('Best: f(%d) = %f' % (len(ix), score))
>80 f(240) = 0.918099
>81 f(236) = 0.918099
>82 f(238) = 0.918099
>83 f(236) = 0.918099
>84 f(239) = 0.918099
>85 f(240) = 0.918099
>86 f(239) = 0.918099
>87 f(245) = 0.918099
>88 f(241) = 0.918099
>89 f(239) = 0.918099
>90 f(239) = 0.918099
>91 f(241) = 0.918099
>92 f(243) = 0.918099
>93 f(245) = 0.918099
>94 f(239) = 0.918099
>95 f(245) = 0.918099
>96 f(244) = 0.918099
>97 f(242) = 0.918099
>98 f(238) = 0.918099
>99 f(248) = 0.918099
>100 f(238) = 0.918099
Done!
Best: f(239) = 0.918099
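Once the search has finished, a natural follow-up (a sketch assuming the subset, X, and y variables from the complete example above are still in scope; it is not part of the original listing) is to fit a final model using only the selected columns:
# fit a final model on the selected columns only (illustrative sketch)
from sklearn.tree import DecisionTreeClassifier
# convert the best boolean subset into column indexes
ix = [i for i, x in enumerate(subset) if x]
# reduce the dataset to the selected columns and fit the model
final_model = DecisionTreeClassifier()
final_model.fit(X[:, ix], y)
# predictions must use rows reduced to the same columns
print(final_model.predict(X[:5, ix]))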
Recursive Feature Elimination (RFE) for Feature Selection in Python
https://machinelearningmastery.com/rfe-feature-selection-in-python/
How to Choose a Feature Selection Method for Machine Learning
https://machinelearningmastery.com/feature-selection-with-real-and-categorical-data/
sklearn.datasets.make_classification API.
https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
itertools.product API.
https://docs.python.org/3/library/itertools.html#itertools.product
Author: 沂水寒城, CSDN blog expert; research interests: machine learning, deep learning, NLP, CV
Blog: http://yishuihancheng.blog.csdn.net