共 23125字,需浏览 47分钟
2021-03-04 00:05
甚至无需查看训练数据集,仅通过对测试集进行爬坡(hill climbing),就可以做出完美的预测。本文介绍如何针对分类和回归任务对测试集进行爬坡。当我们过度使用测试集来评估建模流程时,实际上就是在隐式地对测试集进行爬坡。
爬坡测试集 爬山算法 如何实现爬山 爬坡糖尿病分类数据集 爬坡房屋回归数据集
# example of a synthetic dataset.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# build a reproducible synthetic classification dataset
X, y = make_classification(
    n_samples=5000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=1,
)
print(X.shape, y.shape)
# hold out 33% of the rows as the test set (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
(5000, 20) (5000,)
(3350, 20) (1650, 20) (3350,) (1650,)
# load or prepare the classification dataset
def load_dataset():
    """Generate the synthetic classification dataset used by the example.

    Returns:
        The ``(X, y)`` pair returned by ``make_classification`` for 5000
        samples with 20 features (15 informative, 5 redundant). The fixed
        ``random_state=1`` makes every call produce the same data.
    """
    return make_classification(
        n_samples=5000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=1,
    )
# evaluate a set of predictions
def evaluate_predictions(y_test, yhat):
    """Score a set of predictions against the true test labels.

    Args:
        y_test: True labels of the test set.
        yhat: Predicted labels, one per test example.

    Returns:
        The value computed by ``accuracy_score(y_test, yhat)``.
    """
    return accuracy_score(y_test, yhat)
下面的 random_predictions() 函数生成由 0 和 1 组成的随机预测值。
# create a random set of predictions
def random_predictions(n_examples):
    """Create a random starting set of class-label predictions.

    Args:
        n_examples: Number of predictions to generate.

    Returns:
        A list of ``n_examples`` integers, each drawn from {0, 1}.
    """
    return [randint(0, 1) for _ in range(n_examples)]
# modify the current set of predictions
def modify_predictions(current, n_changes=1):
    """Return a copy of the predictions with randomly chosen labels flipped.

    Args:
        current: Current list of 0/1 predictions; it is not modified.
        n_changes: Number of random positions to flip. The same position
            may be picked more than once.

    Returns:
        A new list with the selected labels flipped (0 <-> 1).
    """
    # copy current solution so the caller's list is left untouched
    updated = current.copy()
    for _ in range(n_changes):
        # select a point to change (randint is inclusive on both ends)
        ix = randint(0, len(updated) - 1)
        # flip the class label
        updated[ix] = 1 - updated[ix]
    return updated
下面的 hill_climb_testset() 函数实现了这一过程,它将测试集作为输入,并返回在爬坡过程中找到的最佳预测集。
# run a hill climb for a set of predictions
def hill_climb_testset(X_test, y_test, max_iterations):
    """Hill climb a set of predictions using only test-set scores.

    Starts from random predictions and repeatedly proposes a modified
    candidate, keeping it whenever it scores at least as well as the
    current solution.

    Args:
        X_test: Test inputs; only ``X_test.shape[0]`` is used, to size the
            prediction list.
        y_test: True test labels used to score each candidate.
        max_iterations: Upper bound on the number of search iterations.

    Returns:
        A ``(solution, scores)`` tuple: the best predictions found and the
        list of scores recorded at the start of each iteration.
    """
    scores = list()
    # generate the initial solution
    solution = random_predictions(X_test.shape[0])
    # evaluate the initial solution
    score = evaluate_predictions(y_test, solution)
    # hill climb to a solution
    for i in range(max_iterations):
        # record scores
        scores.append(score)
        # stop once we achieve the best score
        if score == 1.0:
            break
        # generate new candidate
        candidate = modify_predictions(solution)
        # evaluate candidate
        value = evaluate_predictions(y_test, candidate)
        # check if it is as good or better
        if value >= score:
            solution, score = candidate, value
            print('>%d, score=%.3f' % (i, score))
    return solution, scores
# example of hill climbing the test set for a classification task
from random import randint
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# load or prepare the classification dataset
def load_dataset():
    """Generate the synthetic classification dataset for the full example.

    Returns:
        The ``(X, y)`` pair returned by ``make_classification`` for 5000
        samples with 20 features (15 informative, 5 redundant), seeded
        with ``random_state=1`` so repeated calls give identical data.
    """
    return make_classification(
        n_samples=5000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        random_state=1,
    )
# evaluate a set of predictions
def evaluate_predictions(y_test, yhat):
    """Compute the score of predicted labels against the true labels.

    Args:
        y_test: True labels of the test set.
        yhat: Predicted labels, one per test example.

    Returns:
        The result of ``accuracy_score(y_test, yhat)``.
    """
    return accuracy_score(y_test, yhat)
# create a random set of predictions
def random_predictions(n_examples):
    """Build the random initial solution for the hill climb.

    Args:
        n_examples: Number of predictions to generate.

    Returns:
        A list of ``n_examples`` integers, each either 0 or 1.
    """
    return [randint(0, 1) for _ in range(n_examples)]
# modify the current set of predictions
def modify_predictions(current, n_changes=1):
    """Produce a neighbouring solution by flipping random class labels.

    Args:
        current: Current list of 0/1 predictions; left unmodified.
        n_changes: How many random positions to flip. Positions are drawn
            independently, so one index can be chosen repeatedly.

    Returns:
        A new list containing the flipped labels.
    """
    # work on a copy so the current solution survives a rejected candidate
    updated = current.copy()
    for _ in range(n_changes):
        # select a point to change (both randint bounds are inclusive)
        ix = randint(0, len(updated) - 1)
        # flip the class label
        updated[ix] = 1 - updated[ix]
    return updated
# run a hill climb for a set of predictions
def hill_climb_testset(X_test, y_test, max_iterations):
    """Search for a high-scoring prediction list by stochastic hill climbing.

    A random solution is perturbed one step at a time; a candidate replaces
    the current solution whenever its score is equal or higher.

    Args:
        X_test: Test inputs; only the row count ``X_test.shape[0]`` is read.
        y_test: True test labels used for scoring.
        max_iterations: Maximum number of candidates to try.

    Returns:
        ``(solution, scores)``: the final predictions and the score logged
        at the start of every iteration (for plotting progress).
    """
    scores = list()
    # generate the initial solution
    solution = random_predictions(X_test.shape[0])
    # evaluate the initial solution
    score = evaluate_predictions(y_test, solution)
    # hill climb to a solution
    for i in range(max_iterations):
        # record scores
        scores.append(score)
        # stop once we achieve the best score
        if score == 1.0:
            break
        # generate new candidate
        candidate = modify_predictions(solution)
        # evaluate candidate
        value = evaluate_predictions(y_test, candidate)
        # check if it is as good or better
        if value >= score:
            solution, score = candidate, value
            print('>%d, score=%.3f' % (i, score))
    return solution, scores
# load the dataset
X, y = load_dataset()
print(X.shape, y.shape)
# split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# run hill climb (the training split is never used by the search)
yhat, scores = hill_climb_testset(X_test, y_test, 20000)
# plot the scores vs iterations
pyplot.plot(scores)
pyplot.show()
>8092, score=0.996
>8886, score=0.997
>9202, score=0.998
>9322, score=0.998
>9521, score=0.999
>11046, score=0.999
>12932, score=1.000
# load or prepare the classification dataset
def load_dataset():
    """Download the Pima Indians diabetes CSV and split it into X and y.

    The file is read over the network with no header row.

    Returns:
        A ``(X, y)`` tuple of arrays: every column except the last as the
        inputs, and the last column as the target.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
    df = read_csv(url, header=None)
    data = df.values
    return data[:, :-1], data[:, -1]
# example of hill climbing the test set for the diabetes dataset
from random import randint
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot
# load or prepare the classification dataset
def load_dataset():
    """Fetch the Pima Indians diabetes dataset and separate inputs from target.

    The CSV is downloaded from the URL below and parsed without a header.

    Returns:
        A ``(X, y)`` tuple of arrays, where ``X`` holds all columns but the
        last and ``y`` holds the last column.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
    df = read_csv(url, header=None)
    data = df.values
    return data[:, :-1], data[:, -1]
# evaluate a set of predictions
def evaluate_predictions(y_test, yhat):
    """Score predicted labels against the true test labels.

    Args:
        y_test: True labels of the test set.
        yhat: Predicted labels, one per test example.

    Returns:
        The value of ``accuracy_score(y_test, yhat)``.
    """
    return accuracy_score(y_test, yhat)
# create a random set of predictions
def random_predictions(n_examples):
    """Generate random 0/1 predictions to seed the search.

    Args:
        n_examples: Number of predictions to generate.

    Returns:
        A list of ``n_examples`` integers drawn from {0, 1}.
    """
    return [randint(0, 1) for _ in range(n_examples)]
# modify the current set of predictions
def modify_predictions(current, n_changes=1):
    """Copy the predictions and flip the label at random positions.

    Args:
        current: Current list of 0/1 predictions; it is not mutated.
        n_changes: Number of random positions to flip (an index may repeat).

    Returns:
        The modified copy of ``current``.
    """
    # copy current solution so a rejected candidate does not alter it
    updated = current.copy()
    for _ in range(n_changes):
        # select a point to change (randint includes both end points)
        ix = randint(0, len(updated) - 1)
        # flip the class label
        updated[ix] = 1 - updated[ix]
    return updated
# run a hill climb for a set of predictions
def hill_climb_testset(X_test, y_test, max_iterations):
    """Hill climb the diabetes test set towards a perfect score.

    Begins with random predictions, then repeatedly evaluates a modified
    candidate and adopts it if its score is no worse than the current one.

    Args:
        X_test: Test inputs; only ``X_test.shape[0]`` is used.
        y_test: True test labels used to score candidates.
        max_iterations: Maximum number of search iterations.

    Returns:
        ``(solution, scores)``: the best predictions found and the score
        recorded at the start of each iteration.
    """
    scores = list()
    # generate the initial solution
    solution = random_predictions(X_test.shape[0])
    # evaluate the initial solution
    score = evaluate_predictions(y_test, solution)
    # hill climb to a solution
    for i in range(max_iterations):
        # record scores
        scores.append(score)
        # stop once we achieve the best score
        if score == 1.0:
            break
        # generate new candidate
        candidate = modify_predictions(solution)
        # evaluate candidate
        value = evaluate_predictions(y_test, candidate)
        # check if it is as good or better
        if value >= score:
            solution, score = candidate, value
            print('>%d, score=%.3f' % (i, score))
    return solution, scores
# load the dataset
X, y = load_dataset()
print(X.shape, y.shape)
# split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# run hill climb (only the test split is consulted)
yhat, scores = hill_climb_testset(X_test, y_test, 5000)
# plot the scores vs iterations
pyplot.plot(scores)
pyplot.show()
>617, score=0.961
>627, score=0.965
>650, score=0.969
>683, score=0.972
>743, score=0.976
>803, score=0.980
>817, score=0.984
>945, score=0.988
>1350, score=0.992
>1387, score=0.996
>1565, score=1.000
下面更新 load_dataset() 函数以加载住房数据集。作为加载数据集的一部分,我们将对目标值进行归一化。由于这样可以把浮点值限制在 0 到 1 的范围内,爬坡预测会变得更加简单。通常并不需要这样做,这只是此处为简化搜索算法而采取的做法。
# load or prepare the regression dataset
def load_dataset():
    """Download the housing CSV and return inputs plus a rescaled target.

    The file is read over the network with no header row. The target (last
    column) is reshaped to a single column and passed through
    ``MinMaxScaler`` so that random predictions and targets share a range.

    Returns:
        A ``(X, y)`` tuple: ``X`` is every column except the last; ``y`` is
        the scaled target as a column array of shape ``(n_rows, 1)``.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
    df = read_csv(url, header=None)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    # normalize the target (the scaler expects a 2D column, hence the reshape)
    scaler = MinMaxScaler()
    y = y.reshape((len(y), 1))
    y = scaler.fit_transform(y)
    return X, y
# evaluate a set of predictions
def evaluate_predictions(y_test, yhat):
    """Score regression predictions against the true test targets.

    Args:
        y_test: True target values of the test set.
        yhat: Predicted values, one per test example.

    Returns:
        The value of ``mean_absolute_error(y_test, yhat)``; lower is better.
    """
    return mean_absolute_error(y_test, yhat)
# create a random set of predictions
def random_predictions(n_examples):
    """Create a random starting set of real-valued predictions.

    Args:
        n_examples: Number of predictions to generate.

    Returns:
        A list of ``n_examples`` floats, each produced by ``random()``
        and therefore in the half-open range [0.0, 1.0).
    """
    return [random() for _ in range(n_examples)]
# modify the current set of predictions
def modify_predictions(current, n_changes=1):
    """Return a copy of the predictions with random entries re-drawn.

    Args:
        current: Current list of float predictions; it is not modified.
        n_changes: Number of random positions to replace. The same
            position may be picked more than once.

    Returns:
        A new list in which each selected entry has been replaced by a
        fresh ``random()`` value.
    """
    # copy current solution so the caller's list is left untouched
    updated = current.copy()
    for _ in range(n_changes):
        # select a point to change (randint is inclusive on both ends)
        ix = randint(0, len(updated) - 1)
        # replace the prediction with a new random value
        updated[ix] = random()
    return updated
# add gaussian noise
updated[ix] += gauss(0, 0.1)
# stop once we achieve the best score
if score == 0.0:
# check if it is as good or better
if value <= score:
solution, score = candidate, value
print('>%d, score=%.3f' % (i, score))
# run a hill climb for a set of predictions
def hill_climb_testset(X_test, y_test, max_iterations):
    """Hill climb a set of regression predictions using only test scores.

    Starts from random predictions and repeatedly proposes a modified
    candidate, keeping it whenever its error is no larger than the current
    one (the score is minimised here).

    Args:
        X_test: Test inputs; only ``X_test.shape[0]`` is used, to size the
            prediction list.
        y_test: True test targets used to score each candidate.
        max_iterations: Upper bound on the number of search iterations.

    Returns:
        A ``(solution, scores)`` tuple: the best predictions found and the
        list of scores recorded at the start of each iteration.
    """
    scores = list()
    # generate the initial solution
    solution = random_predictions(X_test.shape[0])
    # evaluate the initial solution
    score = evaluate_predictions(y_test, solution)
    print('>%.3f' % score)
    # hill climb to a solution
    for i in range(max_iterations):
        # record scores
        scores.append(score)
        # stop once we achieve the best score
        if score == 0.0:
            break
        # generate new candidate
        candidate = modify_predictions(solution)
        # evaluate candidate
        value = evaluate_predictions(y_test, candidate)
        # check if it is as good or better
        if value <= score:
            solution, score = candidate, value
            print('>%d, score=%.3f' % (i, score))
    return solution, scores
# example of hill climbing the test set for the housing dataset
from random import random
from random import randint
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot
# load or prepare the regression dataset
def load_dataset():
    """Fetch the housing dataset and min-max scale its target column.

    The CSV is downloaded from the URL below and parsed without a header.
    Scaling the target keeps it comparable with predictions drawn from
    ``random()``.

    Returns:
        A ``(X, y)`` tuple: ``X`` holds all columns but the last; ``y`` is
        the scaled last column as an array of shape ``(n_rows, 1)``.
    """
    url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
    df = read_csv(url, header=None)
    data = df.values
    X, y = data[:, :-1], data[:, -1]
    # normalize the target (reshape first: the scaler works on 2D input)
    scaler = MinMaxScaler()
    y = y.reshape((len(y), 1))
    y = scaler.fit_transform(y)
    return X, y
# evaluate a set of predictions
def evaluate_predictions(y_test, yhat):
    """Compute the error of predicted values against the true targets.

    Args:
        y_test: True target values of the test set.
        yhat: Predicted values, one per test example.

    Returns:
        The result of ``mean_absolute_error(y_test, yhat)``; smaller
        values mean better predictions.
    """
    return mean_absolute_error(y_test, yhat)
# create a random set of predictions
def random_predictions(n_examples):
    """Build the random initial real-valued solution for the hill climb.

    Args:
        n_examples: Number of predictions to generate.

    Returns:
        A list of ``n_examples`` floats drawn with ``random()``, i.e. each
        in [0.0, 1.0).
    """
    return [random() for _ in range(n_examples)]
# modify the current set of predictions
def modify_predictions(current, n_changes=1):
    """Produce a neighbouring solution by re-drawing random entries.

    Args:
        current: Current list of float predictions; left unmodified.
        n_changes: How many random positions to replace. Positions are
            drawn independently, so one index can be chosen repeatedly.

    Returns:
        A new list where each chosen entry is a fresh ``random()`` value.
    """
    # work on a copy so the current solution survives a rejected candidate
    updated = current.copy()
    for _ in range(n_changes):
        # select a point to change (both randint bounds are inclusive)
        ix = randint(0, len(updated) - 1)
        # replace the prediction with a new random value
        updated[ix] = random()
    return updated
# run a hill climb for a set of predictions
def hill_climb_testset(X_test, y_test, max_iterations):
    """Search for a low-error prediction list by stochastic hill climbing.

    A random solution is perturbed one step at a time; a candidate replaces
    the current solution whenever its error is equal or lower.

    Args:
        X_test: Test inputs; only the row count ``X_test.shape[0]`` is read.
        y_test: True test targets used for scoring.
        max_iterations: Maximum number of candidates to try.

    Returns:
        ``(solution, scores)``: the final predictions and the score logged
        at the start of every iteration (for plotting progress).
    """
    scores = list()
    # generate the initial solution
    solution = random_predictions(X_test.shape[0])
    # evaluate the initial solution
    score = evaluate_predictions(y_test, solution)
    print('>%.3f' % score)
    # hill climb to a solution
    for i in range(max_iterations):
        # record scores
        scores.append(score)
        # stop once we achieve the best score
        if score == 0.0:
            break
        # generate new candidate
        candidate = modify_predictions(solution)
        # evaluate candidate
        value = evaluate_predictions(y_test, candidate)
        # check if it is as good or better
        if value <= score:
            solution, score = candidate, value
            print('>%d, score=%.3f' % (i, score))
    return solution, scores
# load the dataset
X, y = load_dataset()
print(X.shape, y.shape)
# split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# run hill climb (only the test split is consulted)
yhat, scores = hill_climb_testset(X_test, y_test, 100000)
# plot the scores vs iterations
pyplot.plot(scores)
pyplot.show()
如果已经达到足够好的分数(例如 1e-7)或对目标域有意义的值,则最好停止搜索。这也留给读者作为练习。例如:
# stop once we achieve a good enough score
if score <= 1e-7:
>95991, score=0.001
>96011, score=0.001
>96295, score=0.001
>96366, score=0.001
>96585, score=0.001
>97575, score=0.001
>98828, score=0.001
>98947, score=0.001
>99712, score=0.001
>99913, score=0.001
Blog: http://yishuihancheng.blog.csdn.net
赞 赏 作 者