【机器学习基础】Scipy（科学计算库）简易入门-技术圈

0.导语

Scipy是一个用于数学、科学、工程领域的常用软件包，可以处理插值、积分、优化、图像处理、常微分方程数值求解、信号处理等问题。它构建在Numpy数组之上，能够高效地对Numpy数组进行计算，使Numpy和Scipy协同工作，高效解决问题。

Scipy是由针对特定任务的子模块组成：

模块名	应用领域
scipy.cluster	矢量量化/Kmeans
scipy.constants	物理和数学常量
scipy.fftpack	傅立叶变换
scipy.integrate	积分程序
scipy.interpolate	插值
scipy.io	数据输入输出
scipy.linalg	线性代数程序
scipy.ndimage	n维图像包
scipy.odr	正交距离回归
scipy.optimize	优化
scipy.signal	信号处理
scipy.sparse	稀疏矩阵
scipy.spatial	空间数据结构和算法
scipy.special	一些特殊的数学函数
scipy.stats	统计

备注：本文代码可以在github下载

https://github.com/fengdu78/Data-Science-Notes/tree/master/4.scipy

1.SciPy-数值计算库

import numpy as np
import pylab as pl

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

import scipy
scipy.__version__#查看版本

'1.0.0'

常数和特殊函数

from scipy import constants as C
print (C.c) # 真空中的光速
print (C.h) # 普朗克常数

299792458.0
6.62607004e-34

C.physical_constants["electron mass"]

(9.10938356e-31, 'kg', 1.1e-38)

# 1英里等于多少米, 1英寸等于多少米, 1克等于多少千克, 1磅等于多少千克
print(C.mile)
print(C.inch)
print(C.gram)
print(C.pound)

1609.3439999999998
0.0254
0.001
0.45359236999999997

import scipy.special as S

print (1 + 1e-20)
print (np.log(1+1e-20))
print (S.log1p(1e-20))

1.0
0.0
1e-20

m = np.linspace(0.1, 0.9, 4)
u = np.linspace(-10, 10, 200)
results = S.ellipj(u[:, None], m[None, :])

print([y.shape for y in results])

[(200, 4), (200, 4), (200, 4), (200, 4)]

#%figonly=使用广播计算得到的`ellipj()`返回值
fig, axes = pl.subplots(2, 2, figsize=(12, 4))
# A raw string keeps the LaTeX backslash literally; the plain literal relied
# on an invalid escape sequence, which newer Python versions warn about.
labels = ["$sn$", "$cn$", "$dn$", r"$\phi$"]
# One panel per array returned by ellipj(); each curve in a panel belongs to
# one value of m.
for ax, y, label in zip(axes.ravel(), results, labels):
    ax.plot(u, y)
    ax.set_ylabel(label)
    ax.margins(0, 0.1)

axes[1, 1].legend(["$m={:g}$".format(m_) for m_ in m], loc="best", ncol=2);

2.拟合与优化-optimize

非线性方程组求解

import pylab as pl
import numpy as np

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

from math import sin, cos
from scipy import optimize

def f(x):
    """Evaluate the residuals of the three-equation nonlinear system.

    ``x`` must expose ``tolist()`` and hold the unknowns ``(x0, x1, x2)``.
    The returned list has one residual per equation; all three are zero
    at a solution.
    """
    # Plain Python floats are needed because math.sin is used below.
    u0, u1, u2 = x.tolist()
    linear_eq = 5*u1 + 3
    trig_eq = 4*u0*u0 - 2*sin(u1*u2)
    product_eq = u1*u2 - 1.5
    return [linear_eq, trig_eq, product_eq]

# f计算方程组的误差，[1,1,1]是未知数的初始值
result = optimize.fsolve(f, [1,1,1]) #❸
print (result)
print (f(result))

[-0.70622057 -0.6        -2.5       ]
[0.0, -9.126033262418787e-14, 5.329070518200751e-15]

def j(x):
    """Return the 3x3 Jacobian of the nonlinear system evaluated at ``x``.

    Row ``i`` holds the partial derivatives of the i-th equation with
    respect to ``x0``, ``x1`` and ``x2``.
    """
    u0, u1, u2 = x.tolist()
    row_linear = [0, 5, 0]
    row_trig = [8 * u0, -2 * u2 * cos(u1 * u2), -2 * u1 * cos(u1 * u2)]
    row_product = [0, u2, u1]
    return [row_linear, row_trig, row_product]


result = optimize.fsolve(f, [1, 1, 1], fprime=j)  #❷
print(result)
print(f(result))

[-0.70622057 -0.6        -2.5       ]
[0.0, -9.126033262418787e-14, 5.329070518200751e-15]

最小二乘拟合

import numpy as np
from scipy import optimize

X = np.array([ 8.19,  2.72,  6.39,  8.71,  4.7 ,  2.66,  3.78])
Y = np.array([ 7.01,  2.78,  6.47,  6.71,  4.1 ,  4.23,  4.05])

def residuals(p):
    """Return the error between the data and the straight line described by p.

    ``p`` unpacks into the slope and the intercept; the module-level arrays
    X and Y supply the data points.
    """
    slope, intercept = p
    fitted = slope*X + intercept
    return Y - fitted

# leastsq使得residuals()的输出数组的平方和最小，参数的初始值为[1,0]
r = optimize.leastsq(residuals, [1, 0]) #❷
k, b = r[0]
print ("k =",k, "b =",b)

k = 0.6134953491930442 b = 1.794092543259387

#%figonly=最小化正方形面积之和（左），误差曲面（右）
scale_k = 1.0
scale_b = 10.0
scale_error = 1000.0

def S(k, b):
    """Return the sum of squared errors between the line y=k*x+b and the data.

    The data points come from the module-level arrays X and Y.  The result
    has the shape of ``k``, so grids of (k, b) candidates can be evaluated
    in one call.
    """
    total = np.zeros(k.shape)
    for px, py in zip(X, Y):
        deviation = py - (k * px + b)
        total += deviation ** 2
    return total

ks, bs = np.mgrid[k - scale_k:k + scale_k:40j, b - scale_b:b + scale_b:40j]
error = S(ks, bs) / scale_error

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Rectangle

fig = pl.figure(figsize=(12, 5))

ax1 = pl.subplot(121)

ax1.plot(X, Y, "o")
X0 = np.linspace(2, 10, 3)
Y0 = k*X0 + b
ax1.plot(X0, Y0)

for x, y in zip(X, Y):
    y2 = k*x+b
    rect = Rectangle((x,y), abs(y-y2), y2-y, facecolor="red", alpha=0.2)
    ax1.add_patch(rect)

ax1.set_aspect("equal")


ax2 = fig.add_subplot(122, projection='3d')

ax2.plot_surface(
    ks, bs / scale_b, error, rstride=3, cstride=3, cmap="jet", alpha=0.5)
ax2.scatter([k], [b / scale_b], [S(k, b) / scale_error], c="r", s=20)
ax2.set_xlabel("$k$")
ax2.set_ylabel("$b$")
ax2.set_zlabel("$error$");

#%fig=带噪声的正弦波拟合
def func(x, p):
    """Model used for the fit: A*sin(2*pi*k*x + theta).

    ``p`` unpacks into ``(A, k, theta)``.
    """
    amplitude, freq, phase = p
    angle = 2 * np.pi * freq * x + phase
    return amplitude * np.sin(angle)


def residuals(p, y, x):
    """Return the difference between the measured y and the model func(x, p).

    ``p`` holds the coefficients that the fit is looking for.
    """
    predicted = func(x, p)
    return y - predicted


x = np.linspace(0, 2 * np.pi, 100)
A, k, theta = 10, 0.34, np.pi / 6  # 真实数据的函数参数
y0 = func(x, [A, k, theta])  # 真实数据
# 加入噪声之后的实验数据
np.random.seed(0)
y1 = y0 + 2 * np.random.randn(len(x))  #❸

p0 = [7, 0.40, 0]  # 第一次猜测的函数拟合参数

# 调用leastsq进行数据拟合
# residuals为计算误差的函数
# p0为拟合参数的初始值
# args为需要拟合的实验数据
plsq = optimize.leastsq(residuals, p0, args=(y1, x))  #❹

print(u"真实参数:", [A, k, theta])
print(u"拟合参数", plsq[0])  # 实验数据拟合后的参数

pl.plot(x, y1, "o", label=u"带噪声的实验数据")
pl.plot(x, y0, label=u"真实数据")
pl.plot(x, func(x, plsq[0]), label=u"拟合数据")
pl.legend(loc="best")

真实参数: [10, 0.34, 0.5235987755982988]
拟合参数 [10.25218748  0.3423992   0.50817423]

def func2(x, A, k, theta):
    """Sine model A*sin(2*pi*k*x + theta) with each coefficient as its own argument."""
    angle = 2*np.pi*k*x+theta
    return A*np.sin(angle)

popt, _ = optimize.curve_fit(func2, x, y1, p0=p0)
print (popt)

[10.25218748  0.3423992   0.50817425]

popt, _ = optimize.curve_fit(func2, x, y1, p0=[10, 1, 0])

print(u"真实参数:", [A, k, theta])

print(u"拟合参数", popt)

真实参数: [10, 0.34, 0.5235987755982988]
拟合参数 [ 0.71093469  1.02074585 -0.12776742]

计算函数局域最小值

def target_function(x, y):
    """Return (1 - x)**2 + 100*(y - x**2)**2, the Rosenbrock function."""
    offset_term = (1 - x)**2
    valley_term = 100 * (y - x**2)**2
    return offset_term + valley_term


class TargetFunction:
    """Objective, gradient and Hessian callbacks that record every evaluation point."""

    def __init__(self):
        # Points at which f / fprime / fhess were evaluated, in call order.
        self.f_points, self.fprime_points, self.fhess_points = [], [], []

    def f(self, p):
        """Return the objective value at ``p`` and record the point."""
        px, py = p.tolist()
        value = target_function(px, py)
        self.f_points.append((px, py))
        return value

    def fprime(self, p):
        """Return the gradient [df/dx, df/dy] at ``p`` and record the point."""
        px, py = p.tolist()
        self.fprime_points.append((px, py))
        grad_x = -2 + 2 * px - 400 * px * (py - px**2)
        grad_y = 200 * py - 200 * px**2
        return np.array([grad_x, grad_y])

    def fhess(self, p):
        """Return the 2x2 Hessian at ``p`` and record the point."""
        px, py = p.tolist()
        self.fhess_points.append((px, py))
        d_xx = 2 * (600 * px**2 - 200 * py + 1)
        d_xy = -400 * px
        return np.array([[d_xx, d_xy],
                         [d_xy, 200]])


def fmin_demo(method):
    """Minimize the target function from (-1, -1) with the given ``method``.

    Returns the optimizer result and a list of three arrays holding the
    points at which the objective, the gradient and the Hessian were
    evaluated.
    """
    recorder = TargetFunction()
    start = (-1, -1)
    res = optimize.minimize(
        recorder.f,
        start,
        method=method,
        jac=recorder.fprime,
        hess=recorder.fhess)
    histories = (recorder.f_points, recorder.fprime_points,
                 recorder.fhess_points)
    return res, [np.array(history) for history in histories]


methods = ("Nelder-Mead", "Powell", "CG", "BFGS", "Newton-CG", "L-BFGS-B")
for method in methods:
    res, (f_points, fprime_points, fhess_points) = fmin_demo(method)
    print(
        "{:12s}: min={:12g}, f count={:3d}, fprime count={:3d}, fhess count={:3d}"
        .format(method, float(res["fun"]), len(f_points), len(fprime_points),
                len(fhess_points)))

Nelder-Mead : min= 5.30934e-10, f count=125, fprime count=  0, fhess count=  0
Powell      : min=           0, f count= 52, fprime count=  0, fhess count=  0
CG          : min= 9.63056e-21, f count= 39, fprime count= 39, fhess count=  0
BFGS        : min= 1.84992e-16, f count= 40, fprime count= 40, fhess count=  0
Newton-CG   : min= 5.22666e-10, f count= 60, fprime count= 97, fhess count= 38
L-BFGS-B    : min=  6.5215e-15, f count= 33, fprime count= 33, fhess count=  0

#%figonly=各种优化算法的搜索路径
def draw_fmin_demo(f_points, fprime_points, ax):
    """Draw the search path of one optimizer over an image of the target function.

    ``f_points`` and ``fprime_points`` are arrays whose rows are the (x, y)
    points at which the objective and the gradient were evaluated; ``ax`` is
    the Matplotlib axes to draw on.
    """
    xmin, xmax = -3, 3
    ymin, ymax = -3, 3
    Y, X = np.ogrid[ymin:ymax:500j, xmin:xmax:500j]
    # Log scale makes the narrow valley of the target function visible.
    Z = np.log10(target_function(X, Y))
    # "lower" puts row 0 at the bottom so the image matches the axis limits;
    # the old alias "bottom" is rejected by newer Matplotlib releases, which
    # only accept "upper" or "lower".
    ax.imshow(Z, extent=(xmin, xmax, ymin, ymax), origin="lower",
              aspect="auto", cmap="gray")
    ax.plot(f_points[:, 0], f_points[:, 1], lw=1)
    # Color encodes the evaluation order of the objective.
    ax.scatter(f_points[:, 0], f_points[:, 1], c=range(len(f_points)), s=50,
               linewidths=0)
    if len(fprime_points):
        ax.scatter(fprime_points[:, 0], fprime_points[:, 1], marker="x",
                   color="w", alpha=0.5)
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)

fig, axes = pl.subplots(2, 3, figsize=(9, 6))
methods = ("Nelder-Mead", "Powell", "CG", "BFGS", "Newton-CG", "L-BFGS-B")
for ax, method in zip(axes.ravel(), methods):
    res, (f_points, fprime_points, fhess_points) = fmin_demo(method)
    draw_fmin_demo(f_points, fprime_points, ax)
    ax.set_aspect("equal")
    ax.set_title(method)

计算全域最小值

def func(x, p):
    """Evaluate the sine model A*sin(2*pi*k*x + theta) with p = (A, k, theta)."""
    amp, freq, phase = p
    argument = 2*np.pi*freq*x+phase
    return amp*np.sin(argument)

def func_error(p, y, x):
    """Return the sum of squared differences between y and the model func(x, p)."""
    deviation = y - func(x, p)
    return np.sum(deviation**2)

x = np.linspace(0, 2*np.pi, 100)
A, k, theta = 10, 0.34, np.pi/6
y0 = func(x, [A, k, theta])
np.random.seed(0)
y1 = y0 + 2 * np.random.randn(len(x))

result = optimize.basinhopping(func_error, (1, 1, 1),
                      niter = 10,
                      minimizer_kwargs={"method":"L-BFGS-B",
                                        "args":(y1, x)})
print (result.x)

[10.25218676 -0.34239909  2.63341581]

#%figonly=用`basinhopping()`拟合正弦波
pl.plot(x, y1, "o", label=u"带噪声的实验数据")
pl.plot(x, y0, label=u"真实数据")
pl.plot(x, func(x, result.x), label=u"拟合数据")
pl.legend(loc="best");

3.线性代数-linalg

解线性方程组

import pylab as pl
import numpy as np
from scipy import linalg

import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

import numpy as np
from scipy import linalg
m, n = 500, 50
A = np.random.rand(m, m)
B = np.random.rand(m, n)
X1 = linalg.solve(A, B)
X2 = np.dot(linalg.inv(A), B)
print (np.allclose(X1, X2))
%timeit linalg.solve(A, B)
%timeit np.dot(linalg.inv(A), B)

True
5.38 ms ± 120 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
8.14 ms ± 994 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

luf = linalg.lu_factor(A)
X3 = linalg.lu_solve(luf, B)
np.allclose(X1, X3)

True

M, N = 1000, 100
np.random.seed(0)
A = np.random.rand(M, M)
B = np.random.rand(M, N)
Ai = linalg.inv(A)
luf = linalg.lu_factor(A)
%timeit linalg.inv(A)
%timeit np.dot(Ai, B)
%timeit linalg.lu_factor(A)
%timeit linalg.lu_solve(luf, B)

50.6 ms ± 1.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
3.49 ms ± 306 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
20.1 ms ± 1.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
4.49 ms ± 65 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

最小二乘解

from numpy.lib.stride_tricks import as_strided
def make_data(m, n, noise_scale):
    """Create a random input, a random impulse response and the noisy output.

    The random generator is re-seeded with 42 on every call, so the result
    is reproducible.  Returns ``(x, yn, h)``: an input of length ``m``, the
    convolution of x and h with added Gaussian noise, and the response of
    length ``n``.  The noise amplitude is ``noise_scale`` times the largest
    value of the clean output.
    """
    np.random.seed(42)
    signal = np.random.standard_normal(m)
    response = np.random.standard_normal(n)
    clean = np.convolve(signal, response)
    noise = np.random.standard_normal(len(clean)) * noise_scale * np.max(clean)
    return signal, clean + noise, response

def solve_h(x, y, n):
    """Estimate an n-tap impulse response from input x and output y by least squares.

    ``x`` is a one-dimensional NumPy array and ``y`` the corresponding
    convolution output.  Returns the ``n`` estimated coefficients in natural
    order.
    """
    # Row i of X is the window x[i:i+n], built as a view without copying.
    # Use the array's own stride rather than its itemsize so that
    # non-contiguous inputs (e.g. x[::2]) are read correctly.
    step = x.strides[0]
    X = as_strided(x, shape=(len(x) - n + 1, n), strides=(step, step))
    # Keep only the output samples for which a full window of x exists.
    Y = y[n - 1:len(x)]
    h = linalg.lstsq(X, Y)
    # Each window is in forward order, so the solution comes out reversed.
    return h[0][::-1]

x, yn, h = make_data(1000, 100, 0.4)
H1 = solve_h(x, yn, 120)
H2 = solve_h(x, yn, 80)

# Compare each estimate with the true coefficients over their common length.
# The H1 line must use H1: comparing h[:100] with h itself always gives 0.0
# and says nothing about the quality of the estimate.
print("Average error of H1:", np.mean(np.abs(H1[:100] - h)))
print("Average error of H2:", np.mean(np.abs(h[:80] - H2)))

Average error of H1: 0.0
Average error of H2: 0.2958422158342371

#%figonly=实际的系统参数与最小二乘解的比较
fig, (ax1, ax2) = pl.subplots(2, 1, figsize=(6, 4))
ax1.plot(h, linewidth=2, label=u"实际的系统参数")
ax1.plot(H1, linewidth=2, label=u"最小二乘解H1", alpha=0.7)
ax1.legend(loc="best", ncol=2)
ax1.set_xlim(0, len(H1))
ax2.plot(h, linewidth=2, label=u"实际的系统参数")
ax2.plot(H2, linewidth=2, label=u"最小二乘解H2", alpha=0.7)
ax2.legend(loc="best", ncol=2)
ax2.set_xlim(0, len(H1));

特征值和特征向量

A = np.array([[1, -0.3], [-0.1, 0.9]])
evalues, evectors = linalg.eig(A)

print(evalues)
print(evectors)

[1.13027756+0.j 0.76972244+0.j]
[[ 0.91724574  0.79325185]
 [-0.3983218   0.60889368]]

#%figonly=线性变换将蓝色箭头变换为红色箭头
points = np.array([[0, 1.0], [1.0, 0], [1, 1]])

def draw_arrows(points, **kw):
    """Draw an arrow from the origin to every (x, y) pair in ``points``.

    Keyword arguments override or extend the default arrow properties
    (blue, "->" style).
    """
    arrow_style = {"color": "blue", "arrowstyle": "->"}
    arrow_style.update(kw)
    for tip_x, tip_y in points:
        pl.annotate("",
                    xy=(tip_x, tip_y), xycoords='data',
                    xytext=(0, 0), textcoords='data',
                    arrowprops=arrow_style)

draw_arrows(points)
draw_arrows(np.dot(A, points.T).T, color="red")
draw_arrows(evectors.T, alpha=0.7, linewidth=2)
draw_arrows(np.dot(A, evectors).T, color="red", alpha=0.7, linewidth=2)

ax = pl.gca()
ax.set_aspect("equal")
ax.set_xlim(-0.5, 1.1)
ax.set_ylim(-0.5, 1.1);

np.random.seed(42)
t = np.random.uniform(0, 2*np.pi, 60)

alpha = 0.4
a = 0.5
b = 1.0
x = 1.0 + a*np.cos(t)*np.cos(alpha) - b*np.sin(t)*np.sin(alpha)
y = 1.0 + a*np.cos(t)*np.sin(alpha) - b*np.sin(t)*np.cos(alpha)
x += np.random.normal(0, 0.05, size=len(x))
y += np.random.normal(0, 0.05, size=len(y))

D = np.c_[x**2, x*y, y**2, x, y, np.ones_like(x)]
A = np.dot(D.T, D)
C = np.zeros((6, 6))
C[[0, 1, 2], [2, 1, 0]] = 2, -1, 2
evalues, evectors = linalg.eig(A, C)     #❶
evectors = np.real(evectors)
err = np.mean(np.dot(D, evectors)**2, 0) #❷
p = evectors[:, np.argmin(err) ]         #❸
print (p)

[-0.55214278  0.5580915  -0.23809922  0.54584559 -0.08350449 -0.14852803]

#%figonly=用广义特征向量计算的拟合椭圆
def ellipse(p, x, y):
    """Evaluate the conic a*x^2 + b*x*y + c*y^2 + d*x + e*y + f at (x, y).

    ``p`` unpacks into the six coefficients ``(a, b, c, d, e, f)``.
    """
    coef_xx, coef_xy, coef_yy, coef_x, coef_y, const = p
    # Accumulate term by term, left to right.
    value = coef_xx*x**2
    value = value + coef_xy*x*y
    value = value + coef_yy*y**2
    value = value + coef_x*x
    value = value + coef_y*y
    return value + const

X, Y = np.mgrid[0:2:100j, 0:2:100j]
Z = ellipse(p, X, Y)
pl.plot(x, y, "ro", alpha=0.5)
pl.contour(X, Y, Z, levels=[0]);

奇异值分解-SVD

r, g, b = np.rollaxis(pl.imread("vinci_target.png"), 2).astype(float)
img = 0.2989 * r + 0.5870 * g + 0.1140 * b
img.shape

(505, 375)

U, s, Vh = linalg.svd(img)
print(U.shape)
print(s.shape)
print(Vh.shape)

(505, 505)
(375,)
(375, 375)

#%fig=按从大到小排列的奇异值
pl.semilogy(s, lw=3);

def composite(U, s, Vh, n):
    """Rebuild a matrix from the first ``n`` singular values and vectors."""
    left = U[:, :n]
    # Scale each of the first n rows of Vh by its singular value.
    scaled_rows = s[:n, np.newaxis] * Vh[:n, :]
    return np.dot(left, scaled_rows)

print (np.allclose(img, composite(U, s, Vh, len(s))))

True

#%fig=原始图像、使用10、20、50个向量合成的图像（从左到右）
img10 = composite(U, s, Vh, 10)
img20 = composite(U, s, Vh, 20)
img50 = composite(U, s, Vh, 50)

%array_image img; img10; img20; img50

UsageError: Line magic function `%array_image` not found.

pl.imshow(img)

pl.imshow(img10)

pl.imshow(img20)

pl.imshow(img50)

4.统计-stats

import numpy as np
import pylab as pl
from scipy import stats

import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

连续概率分布

from scipy import stats
[k for k, v in stats.__dict__.items() if isinstance(v, stats.rv_continuous)]

['ksone',
 'kstwobign',
 'norm',
 'alpha',
 'anglit',
 'arcsine',
 'beta',
 'betaprime',
 'bradford',
 'burr',
 'burr12',
 'fisk',
 'cauchy',
 'chi',
 'chi2',
 'cosine',
 'dgamma',
 'dweibull',
 'expon',
 'exponnorm',
 'exponweib',
 'exponpow',
 'fatiguelife',
 'foldcauchy',
 'f',
 'foldnorm',
 'weibull_min',
 'weibull_max',
 'frechet_r',
 'frechet_l',
 'genlogistic',
 'genpareto',
 'genexpon',
 'genextreme',
 'gamma',
 'erlang',
 'gengamma',
 'genhalflogistic',
 'gompertz',
 'gumbel_r',
 'gumbel_l',
 'halfcauchy',
 'halflogistic',
 'halfnorm',
 'hypsecant',
 'gausshyper',
 'invgamma',
 'invgauss',
 'invweibull',
 'johnsonsb',
 'johnsonsu',
 'laplace',
 'levy',
 'levy_l',
 'levy_stable',
 'logistic',
 'loggamma',
 'loglaplace',
 'lognorm',
 'gilbrat',
 'maxwell',
 'mielke',
 'kappa4',
 'kappa3',
 'nakagami',
 'ncx2',
 'ncf',
 't',
 'nct',
 'pareto',
 'lomax',
 'pearson3',
 'powerlaw',
 'powerlognorm',
 'powernorm',
 'rdist',
 'rayleigh',
 'reciprocal',
 'rice',
 'recipinvgauss',
 'semicircular',
 'skewnorm',
 'trapz',
 'triang',
 'truncexpon',
 'truncnorm',
 'tukeylambda',
 'uniform',
 'vonmises',
 'vonmises_line',
 'wald',
 'wrapcauchy',
 'gennorm',
 'halfgennorm',
 'crystalball',
 'argus']

stats.norm.stats()

(array(0.), array(1.))

X = stats.norm(loc=1.0, scale=2.0)
X.stats()

(array(1.), array(4.))

x = X.rvs(size=10000) # 对随机变量取10000个值
np.mean(x), np.var(x) # 期望值和方差

(1.0048352738823323, 3.9372117720073554)

stats.norm.fit(x) # 得到随机序列期望值和标准差

(1.0048352738823323, 1.984240855341749)

# density=True normalizes the histogram so that it integrates to one.  It
# replaces the old ``normed`` keyword, which recent NumPy releases no longer
# accept.
pdf, t = np.histogram(x, bins=100, density=True)
t = (t[:-1] + t[1:]) * 0.5  # centers of the bins
cdf = np.cumsum(pdf) * (t[1] - t[0])  # running sum times the bin width
p_error = pdf - X.pdf(t)
c_error = cdf - X.cdf(t)
print ("max pdf error: {}, max cdf error: {}".format(
    np.abs(p_error).max(),
    np.abs(c_error).max()))

max pdf error: 0.018998755595167102, max cdf error: 0.018503342378306975

#%figonly=正态分布的概率密度函数（左）和累积分布函数（右）
fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7, 2))
ax1.plot(t, pdf, label=u"统计值")
ax1.plot(t, X.pdf(t), label=u"理论值", alpha=0.6)
ax1.legend(loc="best")
ax2.plot(t, cdf)
ax2.plot(t, X.cdf(t), alpha=0.6);

print(stats.gamma.stats(1.0))
print(stats.gamma.stats(2.0))

(array(1.), array(1.))
(array(2.), array(2.))

stats.gamma.stats(2.0, scale=2)

(array(4.), array(8.))

x = stats.gamma.rvs(2, scale=2, size=4)
x

array([4.40563983, 6.17699951, 3.65503843, 3.28052152])

stats.gamma.pdf(x, 2, scale=2)

array([0.12169605, 0.07037188, 0.14694352, 0.15904745])

X = stats.gamma(2, scale=2)
X.pdf(x)

array([0.12169605, 0.07037188, 0.14694352, 0.15904745])

离散概率分布

x = range(1, 7)
p = (0.4, 0.2, 0.1, 0.1, 0.1, 0.1)

dice = stats.rv_discrete(values=(x, p))
dice.rvs(size=20)

array([2, 5, 2, 6, 1, 6, 6, 5, 3, 1, 5, 2, 1, 1, 1, 1, 1, 2, 1, 6])

np.random.seed(42)
samples = dice.rvs(size=(20000, 50))
samples_mean = np.mean(samples, axis=1)

核密度估计

#%fig=核密度估计能更准确地表示随机变量的概率密度函数
# density=True replaces the ``normed`` keyword, which current Matplotlib no
# longer accepts.
_, bins, step = pl.hist(
    samples_mean, bins=100, density=True, histtype="step", label=u"直方图统计")
# Use the public stats.gaussian_kde name; the stats.kde submodule path is
# deprecated.
kde = stats.gaussian_kde(samples_mean)
x = np.linspace(bins[0], bins[-1], 100)
pl.plot(x, kde(x), label=u"核密度估计")
mean, std = stats.norm.fit(samples_mean)
pl.plot(x, stats.norm(mean, std).pdf(x), alpha=0.8, label=u"正态分布拟合")
pl.legend()

#%fig=`bw_method`参数越大核密度估计曲线越平滑
for bw in [0.2, 0.3, 0.6, 1.0]:
    kde = stats.gaussian_kde([-1, 0, 1], bw_method=bw)
    x = np.linspace(-5, 5, 1000)
    y = kde(x)
    pl.plot(x, y, lw=2, label="bw={}".format(bw), alpha=0.6)
pl.legend(loc="best");

二项、泊松、伽玛分布

stats.binom.pmf(range(6), 5, 1/6.0)

array([4.01877572e-01, 4.01877572e-01, 1.60751029e-01, 3.21502058e-02,
       3.21502058e-03, 1.28600823e-04])

#%fig=当n足够大时二项分布和泊松分布近似相等
lambda_ = 10.0
x = np.arange(20)

n1, n2 = 100, 1000

y_binom_n1 = stats.binom.pmf(x, n1, lambda_ / n1)
y_binom_n2 = stats.binom.pmf(x, n2, lambda_ / n2)
y_poisson = stats.poisson.pmf(x, lambda_)
print(np.max(np.abs(y_binom_n1 - y_poisson)))
print(np.max(np.abs(y_binom_n2 - y_poisson)))
#%hide
fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))

ax1.plot(x, y_binom_n1, label=u"binom", lw=2)
ax1.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
ax2.plot(x, y_binom_n2, label=u"binom", lw=2)
ax2.plot(x, y_poisson, label=u"poisson", lw=2, color="red")
for n, ax in zip((n1, n2), (ax1, ax2)):
    ax.set_xlabel(u"次数")
    ax.set_ylabel(u"概率")
    ax.set_title("n={}".format(n))
    ax.legend()
fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)

0.00675531110335309
0.0006301754049777564

#%fig=模拟泊松分布
np.random.seed(42)
def sim_poisson(lambda_, time):
    """Simulate event counts per unit time and compare them with a Poisson law.

    ``lambda_ * time`` events are placed uniformly in ``[0, time)``.
    Returns ``(x, poisson, dist)``: the left edges of 20 unit-wide count
    bins, the Poisson pmf with rate ``lambda_`` at those values, and the
    observed density of the per-unit counts.
    """
    events = np.random.uniform(0, time, size=lambda_ * time)
    # Number of events falling into each unit-long time slot.
    per_slot, _ = np.histogram(events, bins=time, range=(0, time))
    # How often each count occurs, normalized to a density.
    observed, edges = np.histogram(
        per_slot, bins=20, range=(0, 20), density=True)
    x = edges[:-1]
    return x, stats.poisson.pmf(x, lambda_), observed


lambda_ = 10
times = 1000, 50000
x1, poisson1, dist1 = sim_poisson(lambda_, times[0])
x2, poisson2, dist2 = sim_poisson(lambda_, times[1])
max_error1 = np.max(np.abs(dist1 - poisson1))
max_error2 = np.max(np.abs(dist2 - poisson2))
print("time={}, max_error={}".format(times[0], max_error1))
print("time={}, max_error={}".format(times[1], max_error2))
#%hide
fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))

ax1.plot(x1, dist1, "-o", lw=2, label=u"统计结果")
ax1.plot(x1, poisson1, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)
ax2.plot(x2, dist2, "-o", lw=2, label=u"统计结果")
ax2.plot(x2, poisson2, "->", lw=2, label=u"泊松分布", color="red", alpha=0.6)

for ax, time in zip((ax1, ax2), times):
    ax.set_xlabel(u"次数")
    ax.set_ylabel(u"概率")
    ax.set_title(u"time = {}".format(time))
    ax.legend(loc="lower center")

fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1)

time=1000, max_error=0.01964230201602718
time=50000, max_error=0.001798012894964722

#%fig=模拟伽玛分布
def sim_gamma(lambda_, time, k):
    """Simulate the time spanned by ``k`` consecutive events.

    ``lambda_ * time`` events are placed uniformly in ``[0, time)`` and
    sorted.  Returns ``(x, gamma, dist)``: the centers of 100 histogram
    bins, the gamma pdf with shape ``k`` and scale ``1/lambda_`` at those
    centers, and the observed density of the intervals.
    """
    arrivals = np.sort(np.random.uniform(0, time, size=lambda_ * time))
    # Time between each event and the k-th event after it.
    gaps = arrivals[k:] - arrivals[:-k]
    observed, edges = np.histogram(gaps, bins=100, density=True)
    centers = (edges[1:] + edges[:-1])/2
    theory = stats.gamma.pdf(centers, k, scale=1.0/lambda_)
    return centers, theory, observed

lambda_ = 10
time = 1000
ks = 1, 2
x1, gamma1, dist1 = sim_gamma(lambda_, time, ks[0])
x2, gamma2, dist2 = sim_gamma(lambda_, time, ks[1])
#%hide
fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))

ax1.plot(x1, dist1,  lw=2, label=u"统计结果")
ax1.plot(x1, gamma1, lw=2, label=u"伽玛分布", color="red", alpha=0.6)
ax2.plot(x2, dist2,  lw=2, label=u"统计结果")
ax2.plot(x2, gamma2, lw=2, label=u"伽玛分布", color="red", alpha=0.6)

for ax, k in zip((ax1, ax2), ks):
    ax.set_xlabel(u"时间间隔")
    ax.set_ylabel(u"概率密度")
    ax.set_title(u"k = {}".format(k))
    ax.legend(loc="upper right")

fig.subplots_adjust(0.1, 0.15, 0.95, 0.90, 0.2, 0.1);

T = 100000
A_count = int(T / 5)
B_count = int(T / 10)

A_time = np.random.uniform(0, T, A_count) #❶
B_time = np.random.uniform(0, T, B_count)

bus_time = np.concatenate((A_time, B_time)) #❷
bus_time.sort()

N = 200000
passenger_time = np.random.uniform(bus_time[0], bus_time[-1], N) #❸

idx = np.searchsorted(bus_time, passenger_time) #❹
np.mean(bus_time[idx] - passenger_time) * 60    #❺

202.3388747879705

np.mean(np.diff(bus_time)) * 60

199.99833251643057

#%figonly=观察者偏差
import matplotlib.gridspec as gridspec
pl.figure(figsize=(7.5, 3))

G = gridspec.GridSpec(10, 1)
ax1 = pl.subplot(G[:2,  0])
ax2 = pl.subplot(G[3:, 0])

ax1.vlines(bus_time[:10], 0, 1, lw=2, color="blue", label=u"公交车")
ptime = np.random.uniform(bus_time[0], bus_time[9], 100)
ax1.vlines(ptime, 0, 1, lw=1, color="red", alpha=0.2, label=u"乘客")
ax1.legend()
count, bins = np.histogram(passenger_time, bins=bus_time)
ax2.plot(np.diff(bins), count, ".", alpha=0.3, rasterized=True)
ax2.set_xlabel(u"公交车的时间间隔")
ax2.set_ylabel(u"等待人数");

from scipy import integrate
t = 10.0 / 3  # 两辆公交车之间的平均时间间隔
bus_interval = stats.gamma(1, scale=t)
n, _ = integrate.quad(lambda x: 0.5 * x * x * bus_interval.pdf(x), 0, 1000)
d, _ = integrate.quad(lambda x: x * bus_interval.pdf(x), 0, 1000)
n / d * 60

200.0

学生 t-分布与 t 检验

#%fig=模拟学生t-分布
mu = 0.0
n = 10
samples = stats.norm(mu).rvs(size=(100000, n))  #❶
t_samples = (np.mean(samples, axis=1) - mu) / np.std(
    samples, ddof=1, axis=1) * n**0.5  #❷
sample_dist, x = np.histogram(t_samples, bins=100, density=True)  #❸
x = 0.5 * (x[:-1] + x[1:])
t_dist = stats.t(n - 1).pdf(x)
print("max error:", np.max(np.abs(sample_dist - t_dist)))
#%hide
pl.plot(x, sample_dist, lw=2, label=u"样本分布")
pl.plot(x, t_dist, lw=2, alpha=0.6, label=u"t分布")
pl.xlim(-5, 5)
pl.legend(loc="best")

max error: 0.006832108369761447

#%figonly=当`df`增大，学生t-分布趋向于正态分布
fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(7.5, 2.5))
ax1.plot(x, stats.t(6-1).pdf(x), label=u"df=5", lw=2)
ax1.plot(x, stats.t(40-1).pdf(x), label=u"df=39", lw=2, alpha=0.6)
ax1.plot(x, stats.norm.pdf(x), "k--", label=u"norm")
ax1.legend()

ax2.plot(x, stats.t(6-1).sf(x), label=u"df=5", lw=2)
ax2.plot(x, stats.t(40-1).sf(x), label=u"df=39", lw=2, alpha=0.6)
ax2.plot(x, stats.norm.sf(x), "k--", label=u"norm")
ax2.legend();

n = 30
np.random.seed(42)
s = stats.norm.rvs(loc=1, scale=0.8, size=n)

t = (np.mean(s) - 0.5) / (np.std(s, ddof=1) / np.sqrt(n))
print (t, stats.ttest_1samp(s, 0.5))

2.658584340882224 Ttest_1sampResult(statistic=2.658584340882224, pvalue=0.01263770225709123)

print ((np.mean(s) - 1) / (np.std(s, ddof=1) / np.sqrt(n)))
print (stats.ttest_1samp(s, 1), stats.ttest_1samp(s, 0.9))

-1.1450173670383303
Ttest_1sampResult(statistic=-1.1450173670383303, pvalue=0.26156414618801477) Ttest_1sampResult(statistic=-0.3842970254542196, pvalue=0.7035619103425202)

#%fig=红色部分为`ttest_1samp()`计算的p值
x = np.linspace(-5, 5, 500)
y = stats.t(n-1).pdf(x)
plt.plot(x, y, lw=2)
t, p = stats.ttest_1samp(s, 0.5)
mask = x > np.abs(t)
plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
mask = x < -np.abs(t)
plt.fill_between(x[mask], y[mask], color="red", alpha=0.5)
plt.axhline(color="k", lw=0.5)
plt.xlim(-5, 5);

from scipy import integrate
x = np.linspace(-10, 10, 100000)
y = stats.t(n-1).pdf(x)
mask = x >= np.abs(t)
# integrate.trapz was renamed to integrate.trapezoid and later removed from
# SciPy; prefer the new name and fall back to the old one on old releases.
trapezoid = getattr(integrate, "trapezoid", None) or integrate.trapz
# Area of one tail, doubled for the two-sided p-value.
trapezoid(y[mask], x[mask])*2

0.012633433707685974

m = 200000
mean = 0.5
r = stats.norm.rvs(loc=mean, scale=0.8, size=(m, n))
ts = (np.mean(s) - mean) / (np.std(s, ddof=1) / np.sqrt(n))
tr = (np.mean(r, axis=1) - mean) / (np.std(r, ddof=1, axis=1) / np.sqrt(n))
np.mean(np.abs(tr) > np.abs(ts))

0.012695

np.random.seed(42)

s1 = stats.norm.rvs(loc=1, scale=1.0, size=20)
s2 = stats.norm.rvs(loc=1.5, scale=0.5, size=20)
s3 = stats.norm.rvs(loc=1.5, scale=0.5, size=25)

print (stats.ttest_ind(s1, s2, equal_var=False)) #❶
print (stats.ttest_ind(s2, s3, equal_var=True))  #❷

Ttest_indResult(statistic=-2.2391470627176755, pvalue=0.033250866086743665)
Ttest_indResult(statistic=-0.5946698521856172, pvalue=0.5551805875810539)

卡方分布和卡方检验

#%fig=使用随机数验证卡方分布
# The sum of squares of 4 independent standard normal values per row.
a = np.random.normal(size=(300000, 4))
cs = np.sum(a**2, axis=1)

sample_dist, bins = np.histogram(cs, bins=100, range=(0, 20), density=True)
x = 0.5 * (bins[:-1] + bins[1:])
chi2_dist = stats.chi2.pdf(x, 4)
print("max error:", np.max(np.abs(sample_dist - chi2_dist)))
#%hide
pl.plot(x, sample_dist, lw=2, label=u"样本分布")
# A raw string keeps the LaTeX backslash without relying on an invalid
# escape sequence.
pl.plot(x, chi2_dist, lw=2, alpha=0.6, label=r"$\chi ^{2}$分布")
pl.legend(loc="best")

max error: 0.0030732520533635066

#%fig=模拟卡方分布
repeat_count = 60000
n, k = 100, 5

np.random.seed(42)
# Each row is one experiment: n balls drawn uniformly from k categories.
ball_ids = np.random.randint(0, k, size=(repeat_count, n))
# Number of balls per category in every experiment.
counts = np.apply_along_axis(np.bincount, 1, ball_ids, minlength=k)
# Chi-square statistic against the expected count n/k per category.
cs2 = np.sum((counts - n/k)**2.0/(n/k), axis=1)
# Give the density estimator its own name so that k keeps meaning the number
# of categories, and use the public stats.gaussian_kde name.
kde = stats.gaussian_kde(cs2)
x = np.linspace(0, 10, 200)
# k categories leave k - 1 degrees of freedom.
pl.plot(x, stats.chi2.pdf(x, k - 1), lw=2, label=r"$\chi ^{2}$分布")
pl.plot(x, kde(x), lw=2, color="red", alpha=0.6, label=u"样本分布")
pl.legend(loc="best")
pl.xlim(0, 10);

def choose_balls(probabilities, size):
    """Draw ``size`` balls and return how many fell into each category.

    ``probabilities[i]`` is the probability of category ``i``.  The returned
    array always has ``len(probabilities)`` entries.
    """
    n_categories = len(probabilities)
    r = stats.rv_discrete(values=(range(n_categories), probabilities))
    s = r.rvs(size=size)
    # Without minlength, bincount stops at the largest index actually drawn,
    # so trailing categories that were never drawn would be dropped.
    return np.bincount(s, minlength=n_categories)

np.random.seed(42)
counts1 = choose_balls([0.18, 0.24, 0.25, 0.16, 0.17], 400)
counts2 = choose_balls([0.2]*5, 400)

print(counts1)
print(counts2)

[80 93 97 64 66]
[89 76 79 71 85]

chi1, p1 = stats.chisquare(counts1)
chi2, p2 = stats.chisquare(counts2)

print ("chi1 =", chi1, "p1 =", p1)
print ("chi2 =", chi2, "p2 =", p2)

chi1 = 11.375 p1 = 0.022657601239769634
chi2 = 2.55 p2 = 0.6357054527037017

#%figonly=卡方检验计算的概率为阴影部分的面积
x = np.linspace(0, 30, 200)
CHI2 = stats.chi2(4)
pl.plot(x, CHI2.pdf(x), "k", lw=2)
pl.vlines(chi1, 0, CHI2.pdf(chi1))
pl.vlines(chi2, 0, CHI2.pdf(chi2))
pl.fill_between(x[x>chi1], 0, CHI2.pdf(x[x>chi1]), color="red", alpha=1.0)
pl.fill_between(x[x>chi2], 0, CHI2.pdf(x[x>chi2]), color="green", alpha=0.5)
pl.text(chi1, 0.015, r"$\chi^2_1$", fontsize=14)
pl.text(chi2, 0.015, r"$\chi^2_2$", fontsize=14)
pl.ylim(0, 0.2)
pl.xlim(0, 20);

table = [[43, 9], [44, 4]]
chi2, p, dof, expected = stats.chi2_contingency(table)
print(chi2)
print(p)

1.0724852071005921
0.300384770390566

stats.fisher_exact(table)

(0.43434343434343436, 0.23915695682224306)

5.数值积分-integrate

import pylab as pl
import numpy as np
from scipy import integrate
from scipy.integrate import odeint
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']

球的体积

def half_circle(x):
    """Return the height (1 - x**2)**0.5 of the upper unit semicircle at x."""
    radicand = 1-x**2
    return radicand**0.5

N = 10000
x = np.linspace(-1, 1, N)
dx = x[1] - x[0]
y = half_circle(x)
2 * dx * np.sum(y) # 面积的两倍

3.1415893269307373

np.trapz(y, x) * 2 # 面积的两倍

3.1415893269315975

from scipy import integrate
pi_half, err = integrate.quad(half_circle, -1, 1)
pi_half * 2

3.141592653589797

def half_sphere(x, y):
    """Return the height (1 - x**2 - y**2)**0.5 of the upper unit hemisphere at (x, y)."""
    radicand = 1-x**2-y**2
    return radicand**0.5

volume, error = integrate.dblquad(half_sphere, -1, 1,
        lambda x:-half_circle(x),
        lambda x:half_circle(x))

print (volume, error, np.pi*4/3/2)

2.094395102393199 1.0002356720661965e-09 2.0943951023931953

解常微分方程组

#%fig=洛伦茨吸引子：微小的初值差别也会显著地影响运动轨迹
from scipy.integrate import odeint
import numpy as np

def lorenz(w, t, p, r, b):
    """Return (dx/dt, dy/dt, dz/dt) of the Lorenz system at state ``w``.

    ``w`` holds the position (x, y, z) and ``p``, ``r``, ``b`` are the three
    system parameters.  ``t`` is not used by the equations.
    """
    x, y, z = w.tolist()
    # One line per equation of the Lorenz system.
    dx_dt = p*(y-x)
    dy_dt = x*(r-z)-y
    dz_dt = x*y-b*z
    return dx_dt, dy_dt, dz_dt

t = np.arange(0, 30, 0.02)  # time points of the solution
# Solve the Lorenz system with odeint for two slightly different initial values.
track1 = odeint(lorenz, (0.0, 1.00, 0.0), t, args=(10.0, 28.0, 3.0))
track2 = odeint(lorenz, (0.0, 1.01, 0.0), t, args=(10.0, 28.0, 3.0))
#%hide
from mpl_toolkits.mplot3d import Axes3D
fig = pl.figure()
# Create the 3D axes through the figure: in newer Matplotlib releases a bare
# Axes3D(fig) is no longer attached to the figure, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.plot(track1[:,0], track1[:,1], track1[:,2], lw=1)
ax.plot(track2[:,0], track2[:,1], track2[:,2], lw=1);

ode 类

def mass_spring_damper(xu, t, m, k, b, F):
    """Return (dx/dt, du/dt) for a mass-spring-damper system.

    ``xu`` holds the displacement x and the velocity u; ``m``, ``k``, ``b``
    and ``F`` are the mass, the spring constant, the damping coefficient and
    the applied force.  ``t`` is not used by the equations.
    """
    position, velocity = xu.tolist()
    acceleration = (F - k*position - b*velocity)/m
    return velocity, acceleration

#%fig=滑块的速度和位移曲线
m, b, k, F = 1.0, 10.0, 20.0, 1.0
init_status = 0.0, 0.0
args = m, k, b, F
t = np.arange(0, 2, 0.01)
result = odeint(mass_spring_damper, init_status, t, args)
#%hide
fig, (ax1, ax2) = pl.subplots(2, 1)
ax1.plot(t, result[:, 0], label=u"位移")
ax1.legend()
ax2.plot(t, result[:, 1], label=u"速度")
ax2.legend();

from scipy.integrate import ode

class MassSpringDamper:
    """Mass-spring-damper system whose parameters are stored on the instance."""

    def __init__(self, m, k, b, F):
        self.m = m
        self.k = k
        self.b = b
        self.F = F

    def f(self, t, xu):
        """Return [dx/dt, du/dt] at state ``xu`` = (displacement, velocity).

        The argument order is (t, state); ``t`` itself is not used.
        """
        position, velocity = xu.tolist()
        net_force = self.F - self.k*position - self.b*velocity
        return [velocity, net_force/self.m]

system = MassSpringDamper(m=m, k=k, b=b, F=F)
init_status = 0.0, 0.0
dt = 0.01

r = ode(system.f) #❷
r.set_integrator('vode', method='bdf')
r.set_initial_value(init_status, 0)

t = []
result2 = [init_status]
while r.successful() and r.t + dt < 2: #❸
    r.integrate(r.t + dt)
    t.append(r.t)
    result2.append(r.y)

result2 = np.array(result2)
np.allclose(result, result2)

True

class PID:
    """Discrete PID controller with gains kp, ki, kd and time step dt."""

    def __init__(self, kp, ki, kd, dt):
        self.kp = kp
        self.ki = ki
        self.kd = kd
        self.dt = dt
        self.last_error = None  # error of the previous update, None before the first
        self.status = 0.0       # accumulated error * dt (integral state)

    def update(self, error):
        """Return the controller output for ``error`` and advance the state."""
        proportional = self.kp * error
        # The integral term uses the state accumulated before this update.
        integral = self.ki * self.status
        # No derivative is available on the very first update.
        derivative = (0.0 if self.last_error is None
                      else self.kd * (error - self.last_error) / self.dt)
        self.status += error * self.dt
        self.last_error = error
        return proportional + integral + derivative

#%fig=使用PID控制器让滑块停在位移为1.0处
def pid_control_system(kp, ki, kd, dt, target=1.0):
    """Simulate the mass-spring-damper system driven by a PID controller.

    The system uses the module-level parameters m, k and b and starts at
    rest.  After every integration step of length ``dt`` the controller
    output becomes the force applied during the next step.  Returns
    ``(t, F_arr, result)``: the time points, the applied forces and the
    states, each as an array.
    """
    plant = MassSpringDamper(m=m, k=k, b=b, F=0.0)
    controller = PID(kp, ki, kd, dt)
    start_state = 0.0, 0.0

    solver = ode(plant.f)
    solver.set_integrator('vode', method='bdf')
    solver.set_initial_value(start_state, 0)

    times, states, forces = [0], [start_state], [0]

    while solver.successful() and solver.t + dt < 2.0:
        solver.integrate(solver.t + dt)
        # Deviation of the displacement from the target drives the controller.
        force = controller.update(target - solver.y[0])
        # The new force takes effect from the next integration step on.
        plant.F = force
        times.append(solver.t)
        states.append(solver.y)
        forces.append(force)

    return np.array(times), np.array(forces), np.array(states)


t, F_arr, result = pid_control_system(50.0, 100.0, 10.0, 0.001)
print(u"控制力的终值:", F_arr[-1])
#%hide
fig, (ax1, ax2, ax3) = pl.subplots(3, 1, figsize=(6, 6))
ax1.plot(t, result[:, 0], label=u"位移")
ax1.legend(loc="best")
ax2.plot(t, result[:, 1], label=u"速度")
ax2.legend(loc="best")
ax3.plot(t, F_arr, label=u"控制力")
ax3.legend(loc="best")

控制力的终值: 19.943404699515057

%%time
from scipy import optimize


def eval_func(k):
    """Cost of the PID gains ``k`` = (kp, ki, kd).

    Runs the control simulation with a time step of 0.01 and returns the
    summed absolute deviation of the displacement from 1.0.
    """
    gain_p, gain_i, gain_d = k
    _, _, states = pid_control_system(gain_p, gain_i, gain_d, 0.01)
    displacement = states[:, 0]
    return np.sum(np.abs(displacement - 1.0))


# No "jac" is supplied, so L-BFGS-B estimates the gradient by finite
# differences on its own.  "approx_grad" is an argument of the legacy
# fmin_l_bfgs_b interface, not a minimize() option; passing it in "options"
# only produces an unknown-solver-option warning, so it is left out.
kwargs = {
    "method": "L-BFGS-B",
    "bounds": [(10, 200), (10, 100), (1, 100)],
}

opt_k = optimize.basinhopping(
    eval_func, (10, 10, 10), niter=10, minimizer_kwargs=kwargs)
print(opt_k.x)

[56.67106149 99.97434757  1.33963577]
Wall time: 55.1 s

#%fig=优化PID的参数降低控制响应时间
kp, ki, kd = opt_k.x
t, F_arr, result = pid_control_system(kp, ki, kd, 0.01)
idx = np.argmin(np.abs(t - 0.5))
x, u = result[idx]
print ("t={}, x={:g}, u={:g}".format(t[idx], x, u))
#%hide
fig, (ax1, ax2, ax3) = pl.subplots(3, 1, figsize=(6, 6))
ax1.plot(t, result[:, 0], label=u"位移")
ax1.legend(loc="best")
ax2.plot(t, result[:, 1], label=u"速度")
ax2.legend(loc="best")
ax3.plot(t, F_arr, label=u"控制力")
ax3.legend(loc="best");

t=0.5000000000000002, x=1.07098, u=0.315352

【机器学习基础】Scipy（科学计算库） 简易入门

0.导语

1.SciPy-数值计算库

常数和特殊函数

2.拟合与优化-optimize

非线性方程组求解

最小二乘拟合

计算函数局域最小值

计算全域最小值

3.线性代数-linalg

解线性方程组

最小二乘解

特征值和特征向量

奇异值分解-SVD

4.统计-stats

连续概率分布

离散概率分布

核密度估计

二项、泊松、伽玛分布

学生 t-分布与 t 检验

卡方分布和卡方检验

5.数值积分-integrate

球的体积

解常微分方程组

ode 类

6.信号处理-signal

中值滤波

滤波器设计

连续时间线性系统

7.插值-interpolate

一维插值

外推和 Spline 拟合

参数插值

单调插值

多维插值

griddata

径向基函数插值

8.稀疏矩阵-sparse

稀疏矩阵的储存形式

矩阵向量相乘

示例1

示例2

9.图像处理-ndimage

形态学图像处理

膨胀和腐蚀

Hit和Miss

图像分割

10.空间算法库-spatial

计算最近旁点

凸包

沃罗诺伊图

德劳内三角化

【机器学习基础】Scipy（科学计算库）简易入门