# Bayesian generative classification with KDE (kernel density estimation)

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score


class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on kernel density estimation.

    One ``KernelDensity`` model is fit per class; prediction applies Bayes'
    rule by combining each class's log-likelihood with its log prior.

    Parameters
    ----------
    bandwidth : float
        Kernel bandwidth used for every per-class density model.
    kernel : str
        Kernel name forwarded to ``sklearn.neighbors.KernelDensity``.
    """

    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        """Fit one KDE per class and record log class priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        # Sorted unique labels fix the column order used by predict_proba.
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        # One density model per class, all sharing bandwidth/kernel.
        self.models_ = [
            KernelDensity(bandwidth=self.bandwidth, kernel=self.kernel).fit(Xi)
            for Xi in training_sets
        ]
        # Log prior P(class) = fraction of training samples in that class.
        # Stored as an ndarray so it broadcasts cleanly in predict_proba.
        self.logpriors_ = np.array(
            [np.log(Xi.shape[0] / X.shape[0]) for Xi in training_sets]
        )
        return self

    def predict_proba(self, X):
        """Return posterior probabilities of shape (n_samples, n_classes).

        Entry [i, j] is the normalized posterior probability that sample i
        belongs to class ``self.classes_[j]``.
        """
        # score_samples gives log p(x | class) for each sample; transpose so
        # rows are samples and columns are classes.
        logprobs = np.array([model.score_samples(X) for model in self.models_]).T
        # Add log priors, exponentiate back to probabilities...
        result = np.exp(logprobs + self.logpriors_)
        # ...and normalize each row so the class probabilities sum to 1.
        return result / result.sum(1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for each sample."""
        # argmax over axis 1 picks the highest-posterior column per row.
        return self.classes_[np.argmax(self.predict_proba(X), 1)]


# Load the handwritten-digits dataset (1797 samples, 64 features each).
digits = load_digits()

# Grid-search the KDE bandwidth on a log scale from 10**0 to 10**2.
# NOTE: the `iid` parameter was deprecated in scikit-learn 0.22 and removed
# in 0.24, so it must not be passed to GridSearchCV anymore.
bandwidths = 10 ** np.linspace(0, 2, 100)
grid = GridSearchCV(KDEClassifier(), {'bandwidth': bandwidths}, cv=5)
grid.fit(digits.data, digits.target)

# Plot 5-fold cross-validated accuracy as a function of bandwidth.
scores = list(grid.cv_results_['mean_test_score'])
plt.semilogx(bandwidths, scores)  # logarithmic x axis
plt.xlabel('bandwidth')
plt.ylabel('accuracy')
plt.title('KDE Model Performance')

print(grid.best_params_)
print('\naccuracy =', grid.best_score_)
# Baseline comparison: naive Bayes with Gaussian class-conditional densities.
print('cross_val_score(GaussianNB(), digits.data, digits.target).mean(): ',
      cross_val_score(GaussianNB(), digits.data, digits.target, cv=5).mean())
plt.show()
