An example of species distribution density (KDE on a spherical space)

# -*- coding: UTF-8 -*-
from sklearn.datasets import fetch_species_distributions
from mpl_toolkits.basemap import Basemap
from sklearn.datasets.species_distributions import construct_grids  # note: moved to the private module sklearn.datasets._species_distributions in newer scikit-learn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity

data = fetch_species_distributions()
# extract the species labels and the (latitude, longitude) location matrix
latlon = np.vstack([data.train['dd lat'], data.train['dd long']]).T
print('latlon.shape: ', latlon.shape)
# .startswith('micro') checks whether each species name begins with 'micro', yielding a boolean array
species = np.array([d.decode('ascii').startswith('micro') for d in data.train['species']], dtype='int')
print('species.shape: ', species.shape)
xgrid, ygrid = construct_grids(data)  # build the coordinate grids for the map
m = Basemap(projection='cyl', resolution='c', llcrnrlat=ygrid.min(), urcrnrlat=ygrid.max(),
            llcrnrlon=xgrid.min(), urcrnrlon=xgrid.max())
m.drawmapboundary(fill_color='#DDEEFF')
m.fillcontinents(color='#FFEEDD')
m.drawcoastlines(color='gray', zorder=2)
m.drawcountries(color='gray', zorder=2)
m.scatter(latlon[:, 1], latlon[:, 0], zorder=3, c=species, cmap='rainbow', latlon=True)

# smooth the observation points into a probability distribution with kernel density estimation
# grid points for the contour plot
X, Y = np.meshgrid(xgrid[::5], ygrid[::5][::-1])  # take every fifth point (to lighten the load); ygrid is also reversed
# http://wiki.swarma.net/index.php?title=%E9%A2%84%E6%B5%8B%E7%89%A9%E7%A7%8D%E7%9A%84%E7%A9%BA%E9%97%B4%E5%88%86%E5%B8%83&variant=zh-mo
land_reference = data.coverages[6][::5, ::5]  # coverages[6] (April precipitation) identifies land cells, since only land has data; again take every fifth point
land_mask = (land_reference > -9999).ravel()  # missing values are coded -9999 (ocean); masking them out yields a land mask
xy = np.vstack([Y.ravel(), X.ravel()]).T
xy = np.radians(xy[land_mask])  # keep only the land rows via [land_mask] and convert degrees to radians
fig, ax = plt.subplots(1, 2)
fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)
species_names = ['Bradypus Variegatus', 'Microryzomys Minutus']
cmaps = ['Purples', 'Reds']
for i, axi in enumerate(ax):
    axi.set_title(species_names[i])
    # draw the base map
    m = Basemap(projection='cyl', llcrnrlat=Y.min(), urcrnrlat=Y.max(), llcrnrlon=X.min(), urcrnrlon=X.max(),
                resolution='c', ax=axi)
    m.drawmapboundary(fill_color='#DDEEFF')
    m.drawcoastlines()
    m.drawcountries()
    # build a kernel density estimate on the sphere
    kde = KernelDensity(bandwidth=0.03, metric='haversine')  # 'haversine' measures great-circle distance on the sphere
    kde.fit(np.radians(latlon[species == i]))  # fit on this species' locations, converted to radians
    # evaluate over the continent; -9999 marks ocean
    Z = np.full(land_mask.shape[0], -9999.0)  # np.full() creates an array of length land_mask.shape[0] filled with -9999
    Z[land_mask] = np.exp(kde.score_samples(xy))  # score_samples() returns log-density; evaluate only on land (xy holds the land coordinates in radians)
    Z = Z.reshape(X.shape)  # reshape to X's shape for plotting
    # draw the density contours
    levels = np.linspace(0, Z.max(), 25)  # Z.max() is the largest density estimate
    # contourf takes the grid coordinates plus the height values at the chosen levels; x and y must first be turned into grids with np.meshgrid(x, y)
    axi.contourf(X, Y, Z, levels=levels, cmap=cmaps[i])
plt.show()
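
One detail worth checking in isolation: scikit-learn's haversine metric expects each row as (latitude, longitude) in radians, which is why the script converts with np.radians before fitting. A minimal sketch on synthetic points (the coordinates and bandwidth here are made up for illustration):

# sketch: the (lat, lon)-in-radians convention for metric='haversine'
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
# hypothetical observations scattered around (10 N, 20 E), in degrees
latlon_deg = np.column_stack([10 + rng.randn(100), 20 + rng.randn(100)])
kde = KernelDensity(bandwidth=0.05, metric='haversine')
kde.fit(np.radians(latlon_deg))  # rows must be [lat, lon] in radians
query = np.radians([[10.0, 20.0], [50.0, 80.0]])
print(np.exp(kde.score_samples(query)))  # high density near the cluster, near zero far away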

Finding the bandwidth for kernel density estimation

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
sns.set()


def make_data(N, f=0.3, rseed=1):
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # shift everything from index f * N onward up by 5
    return x


x = make_data(20)
x_d = np.linspace(-4, 8, 1000)
# initialize the estimator and fit the data
kde = KernelDensity(bandwidth=1.0, kernel='gaussian')
kde.fit(x[:, None])
# score_samples returns the log of the probability density

logprob = kde.score_samples(x_d[:, None])
plt.fill_between(x_d, np.exp(logprob), alpha=0.5)
plt.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.ylim(-0.02, 0.22)

bandwidths = 10 ** np.linspace(-1, 1, 100)  # 100 candidate bandwidths from 0.1 to 10, log-spaced
grid = GridSearchCV(KernelDensity(kernel='gaussian'), {'bandwidth': bandwidths}, cv=LeaveOneOut())
grid.fit(x[:, None])
print('grid.best_params_ is: ', grid.best_params_)
plt.show()
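
After the search, GridSearchCV (with the default refit=True) exposes the winning model directly as grid.best_estimator_, so the chosen bandwidth can be used without rebuilding anything. A short sketch continuing from the code above:

# sketch: reuse the cross-validated estimator (assumes grid and x_d from the script above)
kde_best = grid.best_estimator_  # a KernelDensity refit on all of x
logprob_best = kde_best.score_samples(x_d[:, None])
plt.fill_between(x_d, np.exp(logprob_best), alpha=0.5)
plt.title('KDE with bandwidth = {:.3f}'.format(grid.best_params_['bandwidth']))
plt.show()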

Kernel density estimation with a Gaussian kernel

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm  # provides the normal distribution
sns.set()


def make_data(N, f=0.3, rseed=1):
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # shift everything from index f * N onward up by 5
    return x


x = make_data(20)
'''
x_d = np.linspace(-4, 8, 2000)
# count a point whenever it lies within 0.5 of a grid location, and use that count as the density
density = sum((abs(xi - x_d) < 0.5) for xi in x)  # for each tick in x_d, count how many xi fall within 0.5
plt.fill_between(x_d, density, alpha=0.5)  # each x_d has one density value; fill down to the x axis
plt.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1)
plt.axis([-4, 8, -0.2, 8])  # x-axis range, y-axis range
'''
x_d = np.linspace(-4, 8, 1000)
density = sum(norm(xi).pdf(x_d) for xi in x)  # the density over x_d as a sum of normal distributions centered at each xi
plt.fill_between(x_d, density, alpha=0.5)
plt.plot(x, np.full_like(x, -0.1), '|k', markeredgewidth=1)
plt.axis([-4, 8, -0.2, 5])
plt.show()
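
The hand-rolled sum of normals above is exactly an unnormalized Gaussian KDE with bandwidth 1: dividing by N recovers the curve that scikit-learn's KernelDensity produces. A quick sketch to confirm, reusing x, x_d and density from above:

# sketch: the manual sum of norm pdfs equals N times the sklearn KDE with bandwidth=1
from sklearn.neighbors import KernelDensity
kde = KernelDensity(bandwidth=1.0, kernel='gaussian').fit(x[:, None])
sk_density = np.exp(kde.score_samples(x_d[:, None]))
print(np.allclose(density / len(x), sk_density))  # expected: True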


A frequency histogram as stacked blocks

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


def make_data(N, f=0.3, rseed=1):
    rand = np.random.RandomState(rseed)
    x = rand.randn(N)
    x[int(f * N):] += 5  # shift everything from index f * N onward up by 5
    return x


x = make_data(1000)
hist = plt.hist(x, bins=30, density=True)
density, bins, patches = hist
print('density:\n ', density)
print('bins:\n ', bins)
print('patches:\n ', patches)
widths = bins[1:] - bins[:-1]
print('(density * widths).sum() is: ', (density * widths).sum())

x2 = make_data(20)
bins = np.linspace(-5, 10, 10)
fig, ax = plt.subplots(1, 2, figsize=(12, 4), sharex=True, sharey=True, subplot_kw={'xlim': (-4, 9), 'ylim': (-0.02, 0.3)})
fig.subplots_adjust(wspace=0.05)
for i, offset in enumerate([0.0, 0.6]):
    print('bins + offset:\n', bins + offset)
    ax[i].hist(x2, bins=bins + offset, density=True)  # use the specified bin edges
    # np.full_like(): returns an array with the same shape and dtype as the given array, filled with fill_value
    ax[i].plot(x2, np.full_like(x2, -0.01), '|k', markeredgewidth=1)  # markeredgewidth: tick width; '|k': black vertical ticks
print('x2:\n', x2)

fig2, ax2 = plt.subplots()
bins = np.arange(-3, 8)
ax2.plot(x2, np.full_like(x2, -0.1), '|k', markeredgewidth=1)
print('np.histogram(x2, bins):\n', np.histogram(x2, bins))  # returns two arrays: the counts per bin and the bin edges
for count, edge in zip(*np.histogram(x2, bins)):  # pair up each bin count with its left edge
    for i in range(count):  # draw one rectangle per element in the bin
        ax2.add_patch(plt.Rectangle((edge, i), 1, 1, alpha=0.5))  # lower-left corner, width, height, alpha
ax2.set_xlim(-4, 8)
ax2.set_ylim(-0.2, 8)
plt.show()
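
What np.histogram does per bin can be reproduced by hand with np.digitize, a handy sanity check when reasoning about bin edges. A small sketch reusing x2 and bins from above (the two agree unless a sample sits exactly on the last edge, which np.histogram alone treats as closed):

# sketch: rebuild np.histogram's counts with np.digitize (reuses x2 and bins)
idx = np.digitize(x2, bins)  # 0 means below bins[0]; len(bins) means at or above bins[-1]
manual_counts = np.array([(idx == k).sum() for k in range(1, len(bins))])
print(np.array_equal(manual_counts, np.histogram(x2, bins)[0]))  # expected: True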


Visualizing a GMM (Gaussian mixture model)

# -*- coding: UTF-8 -*-
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs  # the old samples_generator module was removed in newer scikit-learn
from matplotlib.patches import Ellipse  # class for drawing ellipses

sns.set()


def draw_ellipse(position, covariance, ax=None, **kwargs):
    # drawing an ellipse requires the position and the covariance
    ax = ax or plt.gca()
    # convert the covariance to principal axes
    if covariance.shape == (2, 2):
        # singular value decomposition
        print('covariance:\n', covariance)
        # https://blog.csdn.net/rainpasttime/article/details/79831533?utm_source=app
        U, s, Vt = np.linalg.svd(covariance)  # compute_uv defaults to 1, computing U, s and Vt; with 0 only s is computed
        print('s:\n', s)
        print('Vt:\n', Vt)
        # arctan2(y, x) gives the angle (in radians) of the point (x, y) seen from the origin, measured from the x axis
        # this sets the orientation of the ellipse
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))  # convert radians to degrees; atan2 behaves the same (C library, so faster)
        print('angle is:', angle)
        print('U:\n', U)
        print('U[1, 0], U[0, 0]):\n', U[1, 0], U[0, 0])
        width, height = 2 * np.sqrt(s)  # the singular values set the size of the ellipse
    else:  # not a 2x2 matrix
        angle = 0
        width, height = 2 * np.sqrt(covariance)  # use the value directly
    # draw the ellipses
    for nsig in range(1, 4):  # one ellipse per density contour level
        # Ellipse takes the center, the full width, the full height, and the counter-clockwise rotation angle
        ax.add_patch(Ellipse(position, nsig * width, nsig * height, angle=angle, **kwargs))


def plot_gmm(gmm, X, label=True, ax=None):
    ax = ax or plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')
    w_factor = 0.2 / gmm.weights_.max()
    print('\ngmm.means_:\n', gmm.means_, '\ngmm.covariances_:\n', gmm.covariances_, '\ngmm.weights_:\n', gmm.weights_)
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):  # draw the ellipses
        print('w * w_factor:', w * w_factor)
        draw_ellipse(pos, covar, alpha=w * w_factor)  # alpha is passed through **kwargs


X, y_true = make_blobs(n_samples=400, centers=4, cluster_std=0.60, random_state=0)
X = X[:, ::-1]
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))  # project with a random (2, 2) matrix: (400, 2) @ (2, 2) stretches the blobs into ellipses
# gmm = GaussianMixture(n_components=4).fit(X)
# gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=42).fit(X)
gmm = GaussianMixture(n_components=4, random_state=42).fit(X_stretched)
labels = gmm.predict(X)
probs = gmm.predict_proba(X)
# print(probs[:5].round(3))
size = 50 * probs.max(1) ** 2
# plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size)
plot_gmm(gmm, X_stretched)
plt.show()
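
Rather than hard-coding n_components=4, GaussianMixture exposes aic() and bic() scores for model selection (lower is better). A minimal sketch over the stretched data:

# sketch: choose n_components by information criteria (reuses X_stretched)
n_range = range(1, 11)
models = [GaussianMixture(n, random_state=42).fit(X_stretched) for n in n_range]
plt.plot(n_range, [m.bic(X_stretched) for m in models], label='BIC')
plt.plot(n_range, [m.aic(X_stretched) for m in models], label='AIC')
plt.xlabel('n_components')
plt.legend()
plt.show()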

Color compression with k-means (the code uses MiniBatchKMeans, not k-nearest neighbors)

# -*- coding: UTF-8 -*-
from sklearn.datasets import load_sample_image
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import MiniBatchKMeans


def plot_pixels(data, title, colors=None, N=10000):  # show N (10,000) pixels
    if colors is None:
        colors = data
    # choose a random subset
    rng = np.random.RandomState(0)

    i = rng.permutation(data.shape[0])[:N]  # pick 10,000 random indices
    colors = colors[i]  # the corresponding 10,000 pixel colors
    # print(colors)
    R, G, B = data[i].T  # split out the R, G and B channels
    fig, ax = plt.subplots(1, 2, figsize=(16, 6))
    ax[0].scatter(R, G, color=colors, marker='.')
    ax[0].set(xlabel='Red', ylabel='Green', xlim=(0, 1), ylim=(0, 1))
    ax[1].scatter(R, B, color=colors, marker='.')
    ax[1].set(xlabel='Red', ylabel='Blue', xlim=(0, 1), ylim=(0, 1))
    fig.suptitle(title, size=20)


china = load_sample_image("china.jpg")
# ax1 = plt.axes(xticks=[], yticks=[])
# ax1.imshow(china)
print('china.shape: ', china.shape)
# print('china: ', china)
data = china / 255.0  # scale to the [0, 1] range
data = data.reshape(427 * 640, 3)  # reshape to (n_samples, n_features)
print('data.shape: ', data.shape)
# plot_pixels(data, title='Input color space: 16 million possible colors')  # the pixel distribution in RGB color space
kmeans = MiniBatchKMeans(16)  # a faster variant that works on mini-batches of the data
kmeans.fit(data)
print('kmeans.predict(data).shape:', kmeans.predict(data).shape)  # one cluster index per pixel
# print('kmeans.predict(data):', kmeans.predict(data))
# each entry of kmeans.predict(data) is a row index into kmeans.cluster_centers_ (the 16 color clusters), used to assemble the recolored pixels
new_colors = kmeans.cluster_centers_[kmeans.predict(data)]
print('kmeans.cluster_centers_.shape:', kmeans.cluster_centers_.shape)  # the cluster centers, i.e. the 16 representative colors

print('new_colors.shape:', new_colors.shape)
plot_pixels(data, colors=new_colors, title="Reduced color space: 16 colors")

china_recolored = new_colors.reshape(china.shape)  # reshape back to the original image shape (427, 640, 3)
fig2, ax2 = plt.subplots(1, 2, figsize=(16, 6), subplot_kw=dict(xticks=[], yticks=[]))
fig2.subplots_adjust(wspace=0.05)
ax2[0].imshow(china)
ax2[0].set_title('Original Image', size=16)
ax2[1].imshow(china_recolored)
ax2[1].set_title('16-color Image', size=16)
plt.show()
"""
# 按照b的顺序用a的行来组成新的矩阵
a = np.mat([[1, 2, 3],
[4, 5, 6]])
b = np.array([1, 1, 0, 0, 0, 1, 0])
c = a[b]
print('c:\n', c)
"""

Clustering digits with k-means (the code uses KMeans, not k-nearest neighbors)

# -*- coding: utf-8 -*
from sklearn.datasets import load_digits
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.stats import mode
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.manifold import TSNE  # nonlinear (manifold) embedding module

digits = load_digits()
# print(digits.data.shape)
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits.data)
# print(kmeans.cluster_centers_.shape)

fig, ax = plt.subplots(2, 5, figsize=(8, 3))
centers = kmeans.cluster_centers_.reshape(10, 8, 8)
for axi, center in zip(ax.flat, centers):
    axi.set(xticks=[], yticks=[])
    axi.imshow(center, interpolation='nearest', cmap=plt.cm.binary)

labels = np.zeros_like(clusters)  # an all-zero array with the same shape as clusters
# print('clusters:\n', clusters)
# print('clusters.shape:\n', clusters.shape)
# print('labels:\n', labels)
for i in range(10):  # relabel: give each cluster the digit that occurs most often in it
    mask = (clusters == i)  # mask for the points assigned to cluster i
    # print('mask:\n', mask)
    # print('digits.target[mask]:\n', digits.target[mask])  # the true labels selected by the mask
    # set every point in the cluster to the majority true label selected by the mask
    labels[mask] = mode(digits.target[mask])[0]  # mode(): returns the most frequent value and its count; ties go to the smaller value
    # print('labels[mask]:\n', labels[mask])  # after relabeling
print('accuracy_score is: ', accuracy_score(digits.target, labels))
fig2, ax2 = plt.subplots()
mat = confusion_matrix(digits.target, labels)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels=digits.target_names,
            yticklabels=digits.target_names)
plt.xlabel('true label')
plt.ylabel('predicted label')

# preprocess: project the data down to two dimensions
# init defaults to 'random' (random initialization); 'pca' initializes from a PCA projection (commonly used); a numpy array must have shape (n_samples, n_components)
tsne = TSNE(n_components=2, init='pca', random_state=0)
digits_proj = tsne.fit_transform(digits.data)
# compute the clusters
kmeans = KMeans(n_clusters=10, random_state=0)
clusters = kmeans.fit_predict(digits_proj)
# align the cluster labels with the true digits
labels = np.zeros_like(clusters)
for i in range(10):
    mask = (clusters == i)
    labels[mask] = mode(digits.target[mask])[0]
print('accuracy_score is: ', accuracy_score(digits.target, labels))
plt.show()
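
The mode-based relabeling assigns each cluster independently, so in principle two clusters could map to the same digit. An alternative is to match clusters to digits globally with the Hungarian algorithm on the confusion matrix; a sketch (not part of the original walkthrough), reusing clusters from the t-SNE run above:

# sketch: one-to-one cluster-to-digit matching via scipy's linear_sum_assignment
from scipy.optimize import linear_sum_assignment
cost = -confusion_matrix(digits.target, clusters)  # negate counts to maximize agreement
row_ind, col_ind = linear_sum_assignment(cost)
mapping = dict(zip(col_ind, row_ind))  # cluster id -> digit
labels_hungarian = np.array([mapping[c] for c in clusters])
print('accuracy_score is: ', accuracy_score(digits.target, labels_hungarian))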

Eigenfaces with PCA

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from sklearn.datasets import fetch_lfw_people
from sklearn.decomposition import PCA
seaborn.set()

faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)  # 1348 images, each a 62x47 matrix for display
print(faces.data.shape)  # 1348 images, each with 2914 feature dimensions
pca = PCA(150)  # keep the first 150 principal components
pca.fit(faces.data)
fig, axes = plt.subplots(3, 8, figsize=(9, 4), subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')  # show the composite image for each component
print('pca.components_:\n', pca.components_.shape)  # (150, 2914): 150 components, each a 2914-pixel vector
fig2 = plt.figure()
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

pca = PCA(150).fit(faces.data)
components = pca.transform(faces.data)
projected = pca.inverse_transform(components)
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r')
    ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r')
ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')
plt.show()
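
Related to the cumulative-variance curve above: PCA also accepts a fraction for n_components and keeps just enough components to explain that share of the variance. A short sketch:

# sketch: let PCA pick the number of components for 95% explained variance
pca95 = PCA(0.95).fit(faces.data)
print('components needed for 95% variance:', pca95.n_components_)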

Introducing the k-means algorithm

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.datasets import make_blobs  # the old samples_generator module was removed in newer scikit-learn
from sklearn.cluster import KMeans

sns.set()
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=0.60, random_state=0)
# plt.scatter(X[:, 0], X[:, 1], s=50)
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)  # the cluster index for each point, used here to color the scatter plot
# print('y_kmeans:\n', y_kmeans)
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
print('centers:\n', centers)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)

plt.show()
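
The k-means loop itself is short enough to write by hand: alternate between assigning each point to its nearest center and recomputing each center as the mean of its cluster. A minimal sketch of that expectation-maximization view, reusing X from above (pairwise_distances_argmin handles the assignment step):

# sketch: bare-bones k-means via alternating assignment and mean updates
from sklearn.metrics import pairwise_distances_argmin

def find_clusters(X, n_clusters, rseed=2):
    rng = np.random.RandomState(rseed)
    centers = X[rng.permutation(X.shape[0])[:n_clusters]]  # random initial centers
    while True:
        labels = pairwise_distances_argmin(X, centers)  # assign each point to the nearest center
        new_centers = np.array([X[labels == i].mean(0) for i in range(n_clusters)])
        if np.all(centers == new_centers):  # stop once the centers no longer move
            break
        centers = new_centers
    return centers, labels

centers2, labels2 = find_clusters(X, 4)
plt.scatter(X[:, 0], X[:, 1], c=labels2, s=50, cmap='viridis')
plt.show()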