A fix for SettingWithCopyWarning: "A value is trying to be set on a copy of a slice from a DataFrame"

The root cause is that with chained indexing, pandas cannot tell whether the intermediate result is a view of the original DataFrame or a copy of it.

The assignment therefore needs to be made explicit:

df['A'][df['B'] < 1] = df['C'][df['B'] < 1]

becomes

df.loc[df['B'] < 1, 'A'] = df['C'][df['B'] < 1]

Reference:

https://blog.csdn.net/haolexiao/article/details/81180571
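
A minimal sketch of the two forms (the DataFrame and the column names here are just examples):

import pandas as pd

df = pd.DataFrame({'A': [10, 20, 30], 'B': [0.5, 2.0, 0.2], 'C': [1, 2, 3]})

# Chained indexing may write to a temporary copy and raises SettingWithCopyWarning:
# df['A'][df['B'] < 1] = df['C'][df['B'] < 1]

# Explicit .loc tells pandas exactly which rows and columns to modify:
df.loc[df['B'] < 1, 'A'] = df['C'][df['B'] < 1]
print(df)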

Forward-filling NaN values in an array

# -*- coding: utf-8 -*-
import numpy as np

# For a 2-D array
arr = np.array([[5, np.nan, np.nan, 7, 2, 3],
                [3, np.nan, 1, 8, np.nan, 4],
                [4, 9, 6, np.nan, np.nan, 5]])
mask = np.isnan(arr)
print('mask shape is: ', mask.shape)
# Where ~mask (i.e. not NaN) is True keep the column index, otherwise use 0
idx = np.where(~mask, np.arange(mask.shape[1]), 0)
print('idx before accumulate:\n', idx)  # NaN positions become 0, the rest keep their own column index
# Running maximum along each row: every 0 is replaced by the index of the last valid column
np.maximum.accumulate(idx, axis=1, out=idx)
print('idx after accumulate:\n', idx)
# Fancy indexing: for every position, take the value found at the last valid column of that row
out = arr[np.arange(idx.shape[0])[:, None], idx]
print('np.arange(idx.shape[0])[:, None]:\n', np.arange(idx.shape[0])[:, None])
print('arr:\n', arr)
print('out:\n', out)

# For a 1-D array
arr2 = np.array([5, np.nan, np.nan, 7, 2, 3, np.nan, 4])
mask2 = np.isnan(arr2)
print('mask2:\n', mask2)
# Where ~mask2 (i.e. not NaN) is True keep the index, otherwise use 0
idx2 = np.where(~mask2, np.arange(len(mask2)), 0)
np.maximum.accumulate(idx2, out=idx2)
out2 = arr2[idx2]
print('arr2:\n', arr2)
print('out2:\n', out2)
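
As a quick sanity check (my addition, not part of the original snippet), the 1-D result agrees with pandas' built-in forward fill:

import pandas as pd

# arr2 is the array defined above
print(pd.Series(arr2).ffill().to_numpy())  # [5. 5. 5. 7. 2. 3. 3. 4.], the same values as out2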

Fancy indexing with broadcast coordinate arrays

a = np.array([[5, 2],
              [3, np.nan]])
b = np.array([[0],
              [1]])
c = np.array([[1, 1],
              [0, 1]])
print('a[b, c]:\n', a[b, c])

Output:

a[b, c]:
[[ 2.  2.]
 [ 3. nan]]

Here a is the array being indexed, b supplies the row coordinates and c the column coordinates. After b and c are broadcast against each other, the result in coordinate form is
[[value at (0, 1), value at (0, 1)],
 [value at (1, 0), value at (1, 1)]]
which is
[[ 2.  2.]
 [ 3. nan]]
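
The broadcasting can also be made explicit, which shows exactly which (row, column) pair each output element comes from. A small sketch using the arrays a, b, c defined above:

rows, cols = np.broadcast_arrays(b, c)  # both become shape (2, 2)
print(rows)            # [[0 0], [1 1]]
print(cols)            # [[1 1], [0 1]]
print(a[rows, cols])   # identical to a[b, c]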

Scraping rental listings from SUUMO

In the previous post a rough target area was decided; this time the listings are scraped from SUUMO. The target area is around Fujisawa.

First, find the total number of result pages:

number_of_pages = bs.find('ol', {'class': 'pagination-parts'}).find_all('li')[-1].find('a').get_text()  # total number of result pages
The scrape of a single page is then wrapped in a loop over all pages and the results are written out.
After each page finishes, the Excel file is updated, so the data collected so far survives a network error or the server blocking the scraper.
To drop irrelevant rows, listings near either Fujisawa or Tsujido are saved to a separate file, using a boolean mask:
# keep only listings near Fujisawa Station or Tsujido Station
new_df = df[(df['station_1'] == '藤沢駅') | (df['station_1'] == '辻堂駅') |
            (df['station_2'] == '藤沢駅') | (df['station_2'] == '辻堂駅') |
            (df['station_3'] == '藤沢駅') | (df['station_3'] == '辻堂駅')]
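
The same filter can be written a little more compactly with isin (a sketch assuming the same DataFrame and column names):

targets = ['藤沢駅', '辻堂駅']
near_target = df[['station_1', 'station_2', 'station_3']].isin(targets).any(axis=1)
new_df = df[near_target]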

Entire code:
# -*- coding: utf-8 -*-
import urllib.request
import random
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import math
import time
import pandas as pd
import numpy as np
MAIN_URL = 'https://suumo.jp/jj/chintai/ichiran/FR301FC001/?ar=030&bs=040&ta=14&sc=14205&cb=0.0&ct=9999999&et=9999999&cn=9999999&mb=0&mt=9999999&tc=0400901&shkr1=03&shkr2=03&shkr3=03&shkr4=03&fw2=&page=' #
INFORMATION_DIC = {}
INFORMATION_DF = pd.DataFrame({'page_number': [], 'index_in_page': [], 'title': [],
                               'address': [], 'line_1': [], 'line_2': [], 'line_3': [], 'station_1': [],
                               'station_2': [], 'station_3': [], 'time_to_station_1': [], 'time_to_station_2': [],
                               'time_to_station_3': [], 'structure': [], 'rent': [], 'service_fee': [],
                               'deposit': [], 'gift': [], 'area': [], 'link': []}, index=None)


def span_over_pages(object_url, total_pages):  # scrape every result page of the base URL
    for i_page_number in range(1, total_pages + 1):
        executed_url = ''.join([object_url, str(i_page_number)])
        try:
            execute_web_scrape(executed_url, i_page_number)
        except Exception:
            print('error!')
            raise
        else:
            INFORMATION_DF.to_excel('租房信息.xlsx')
            # print(df)
            print('Page ', i_page_number, 'done.')
            time.sleep(3)


def execute_web_scrape(executed_url, page_number):  # scrape the current page
    global START_LOG
    bs_item_boxes = BeautifulSoup(urlopen(executed_url).read(), 'html.parser').find_all(
        'div', {'class': 'cassetteitem_content'})  # all listing boxes
    bs_agent_boxes = BeautifulSoup(urlopen(executed_url).read(), 'html.parser').find_all(
        'table', {'class': 'cassetteitem_other'})  # all agent boxes
    INFORMATION_DIC[page_number] = {}
    for item_index_in_page in range(len(bs_item_boxes)):  # iterate over every listing on the page
        print('item {0} collected:\n'.format(item_index_in_page))
        # locate all room-information blocks of this listing
        building_configs = bs_agent_boxes[item_index_in_page].find_all('tr', {'class': 'js-cassette_link'})
        # room-information blocks belonging to the same building
        room_information_blocks = [building_configs[i].find_all('li') for i in range(len(building_configs))]
        house_address = bs_item_boxes[item_index_in_page].find_all(
            'li', {'class': 'cassetteitem_detail-col1'})[0].get_text()  # address
        add_bs = bs_item_boxes[item_index_in_page].find_all('div', {'class': 'cassetteitem_detail-text'})  # commuting info
        commuting_list = [add_bs[i].get_text() for i in range(len(add_bs))]  # keep every commuting record in a list
        INFORMATION_DIC[page_number][item_index_in_page] = {}
        INFORMATION_DIC[page_number][item_index_in_page]['title'] = \
            bs_item_boxes[item_index_in_page].find_all('div', {'class': 'cassetteitem_content-title'})[0].get_text()
        INFORMATION_DIC[page_number][item_index_in_page]['add'] = commuting_list
        house_name = bs_item_boxes[item_index_in_page].find_all(
            'div', {'class': 'cassetteitem_content-title'})[0].get_text()  # building name
        route_list = list(filter(None, commuting_list))  # drop empty entries; the '/' separators are stripped below
        line_used = [re.search(r'.+(\/)', route_list[i]).group().rstrip('/') for i in range(len(route_list))]
        station_used = [re.search(r'(\/)[^\s]+', route_list[i]).group().lstrip('/') for i in range(len(route_list))]
        time_to_station_phrase = [re.search(r'(\s).{1,2}(\d)+.', route_list[i]).group() for i in range(len(route_list))]
        # time_to_station_phrase = [re.search(r'(\s).+', route_list[i]).group() for i in range(len(route_list))]
        # list of travel times to each station
        time_to_station_phrase_splited = [re.split(r'(\d+)', time_to_station_phrase[i]) for i in range(len(route_list))]
        time_to_stations = np.full(3, np.nan)  # NaN placeholders
        lines = [np.nan, np.nan, np.nan]
        stations = [np.nan, np.nan, np.nan]
        for each_approach in range(len(time_to_station_phrase_splited)):  # fill in the real values
            # travel time
            time_to_stations[each_approach] = time_to_station_phrase_splited[each_approach][1]
            # line used
            lines[each_approach] = line_used[each_approach]
            # station used
            stations[each_approach] = station_used[each_approach]
        # append one record per room of this listing
        for i in range(len(room_information_blocks)):
            rent = re.search(r'(\d)+(\.){0,1}(\d)*', room_information_blocks[i][0].get_text()).group()
            service_fee = lambda x: x if len(x) == 1 else float(re.search(r'(\d)+(\.){0,1}(\d)*', x).group()) / 10000
            deposit = lambda x: x if len(x) == 1 else re.search(r'(\d)+(\.){0,1}(\d)*', x).group()
            gift = lambda x: x if len(x) == 1 else re.search(r'(\d)+(\.){0,1}(\d)*', x).group()
            structure = room_information_blocks[i][4].get_text()
            area = re.search(r'(\d)+(\.){0,1}(\d)*', room_information_blocks[i][5].get_text()).group()
            inter_link = building_configs[i].find('td', {'class': 'ui-text--midium ui-text--bold'}).find(
                'a', {'class': 'js-cassette_link_href cassetteitem_other-linktext'}).attrs['href']
            INFORMATION_DF.loc[START_LOG] = [page_number, item_index_in_page, house_name,
                                             house_address, lines[0], lines[1], lines[2],
                                             stations[0], stations[1], stations[2],
                                             time_to_stations[0],
                                             time_to_stations[1], time_to_stations[2],
                                             structure, rent,
                                             service_fee(room_information_blocks[i][1].get_text()),
                                             deposit(room_information_blocks[i][2].get_text()),
                                             gift(room_information_blocks[i][3].get_text()), area,
                                             ''.join(['https://suumo.jp/', inter_link])]
            START_LOG = 1 + START_LOG
    # print('dic is: \n', INFORMATION_DIC)


if __name__ == '__main__':
    START_LOG = 0
    bs = BeautifulSoup(urlopen(''.join([MAIN_URL, str(1)])).read(), 'html.parser')
    # total number of result pages for the rental listings
    number_of_pages = bs.find('ol', {'class': 'pagination-parts'}).find_all('li')[-1].find('a').get_text()
    print('number_of_pages is: ', number_of_pages)
    span_over_pages(MAIN_URL, int(number_of_pages))
    df = pd.read_excel('租房信息.xlsx', index_col=0)
    # keep only listings near Fujisawa Station or Tsujido Station
    new_df = df[(df['station_1'] == '藤沢駅') | (df['station_1'] == '辻堂駅') |
                (df['station_2'] == '藤沢駅') | (df['station_2'] == '辻堂駅') |
                (df['station_3'] == '藤沢駅') | (df['station_3'] == '辻堂駅')]
    new_df.to_excel('rental_information.xlsx', index=False)

Collecting travel times to a given station

This script looks for convenient places to live, to keep the commute into central Tokyo manageable.

First, candidate stations are taken from this site (https://ensenmin.com/first).

Among the candidates, the station names that appear more than once are picked out first.

Then Python's selenium module is used to simulate the browser's actions.
Looking at the page source of Yahoo乗換案内 (https://transit.yahoo.co.jp/), regular expressions pick out the fields of interest: the departure station, the departure time, the arrival time, and the number of transfers.
The time cost is obtained by taking the difference of the two timestamps, converting it to a string, and appending it row by row to a DataFrame.
Finally everything is written out to Excel.
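
A minimal sketch of that time-cost step, with example HH:MM strings (the full script below does the same thing):

import datetime

depart_time, reach_time = '06:06', '07:01'  # example values parsed from the result page
start = datetime.datetime.strptime(depart_time, '%H:%M')
reach = datetime.datetime.strptime(reach_time, '%H:%M')
time_cost = int((reach - start).total_seconds() // 60)
print(time_cost, 'mins')  # 55 mins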

The routes to Shinjuku Station were looked up in the same way and lined up next to these results.

Taking the rent into account as well, this makes it possible to pick a place to live.

The code is below.

# -*- coding: utf-8 -*-
import urllib.request
import random
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import math
import time
import datetime
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

STATION_RECOMMENDATION_LINK = 'https://ensenmin.com/first' #
INFORMATION_DIC = {}
INFORMATION_DF = pd.DataFrame({'index': [], 'start_station': [], 'destination': [],
                               'depart_time': [], 'reach_time': [], 'time_cost': [], 'transfer_times': []}, index=None)
STATION_CANDIDATES = []
yahoo_link = 'https://transit.yahoo.co.jp/'


def look_up_route(depart_station, destination, index):
    chrome_driver = webdriver.Chrome(r'C:\Users\cdkag\Desktop\租房爬虫\chromedriver.exe')
    chrome_driver.get(yahoo_link)
    chrome_driver.find_element_by_id('sfrom').send_keys(depart_station)
    chrome_driver.find_element_by_id('sto').send_keys(destination)  # destination station
    chrome_driver.find_element_by_id('air').click()
    chrome_driver.find_element_by_id('sexp').click()
    chrome_driver.find_element_by_id('exp').click()
    chrome_driver.find_element_by_id('hbus').click()
    chrome_driver.find_element_by_id('bus').click()
    chrome_driver.find_element_by_id('fer').click()  # exclude ferries
    chrome_driver.find_element_by_id('tsFir').click()  # search from the first train
    chrome_driver.find_element_by_id('y').send_keys('2020')  # timetable year
    chrome_driver.find_element_by_id('m').send_keys('3')  # timetable month
    chrome_driver.find_element_by_id('d').send_keys('2')  # timetable day
    chrome_driver.find_element_by_id('hh').send_keys('6')  # timetable hour
    chrome_driver.find_element_by_id('mm').send_keys('0')  # timetable minute
    chrome_driver.find_element_by_class_name('optSort').find_element_by_tag_name('select').send_keys('乗り換え回数順')
    element = chrome_driver.find_element_by_id('searchModuleSubmit')  # locate the submit button
    element.submit()  # submit the form
    # print('current_url:\n', chrome_driver.current_url)  # URL of the result page
    new_page = BeautifulSoup(urlopen(chrome_driver.current_url).read(), 'html.parser')
    start_time_bs = new_page.find_all('li', {'class': 'time'})[1].get_text()
    depart_time = re.search(r'(\d){2}(\:)(\d){2}', start_time_bs).group()
    print('departure time: ', depart_time)
    reach_time = new_page.find_all('li', {'class': 'time'})[1].get_text()
    reach_time = re.search(r'(\→)(\d){2}(\:)(\d){2}', reach_time).group()[1:]
    # reach_time = new_page.find_all('li', {'class': 'time'})[1].find('span', {'class': 'mark'}).get_text()
    print('arrival time: ', reach_time)
    start_time_cal = datetime.datetime.strptime(depart_time, '%H:%M')  # parse as a timestamp
    reach_time_cal = datetime.datetime.strptime(reach_time, '%H:%M')  # parse as a timestamp
    time_cost = str(re.search(r'(\d){1}(\:)(\d){2}', str(reach_time_cal - start_time_cal)).group())
    print(int(time_cost[:1]), int(time_cost[2:]))
    time_cost = int(time_cost[:1]) * 60 + int(time_cost[2:])
    print('time cost: ', time_cost, ' mins')
    transfer_times = re.search(r'(\d)+', new_page.find_all('li', {'class': 'transfer'})[0].find(
        'span', {'class': 'mark'}).get_text()).group()
    print('number of transfers: ', transfer_times)
    INFORMATION_DF.loc[index] = [index, depart_station, destination,
                                 depart_time, reach_time, time_cost, transfer_times]
    INFORMATION_DF.to_excel('Yahoo时刻表.xlsx')
    chrome_driver.close()


if __name__ == '__main__':
    dic_candi = ['大崎', '品川', '池袋', '東京', '新宿', '御茶ノ水', '上野', '田端', '渋谷', '虎ノ門', '茗荷谷', '後楽園', '霞ヶ関', '広尾', '半蔵門', '水天宮前', '白金高輪', '市ヶ谷', '駒込', '溜池山王', '新宿三丁目', '泉岳寺', '新線新宿', '岩本町', '御成門', '都庁前', '汐留', '蒲田', '赤羽', '東十条', '新木場', '中野', '成城学園前', '経堂', '二子玉川', '桜上水', '八幡山', '富士見ヶ丘', '上石神井', '光が丘', '石神井公園', '豊島園', '成増', '上板橋', '竹ノ塚', '青砥', '高砂', '浅草', '荻窪', '中野富士見町', '中目黒', '北千住', '南千住', '八丁堀', '代々木上原', '綾瀬', '北綾瀬', '押上', '清澄白河', '東陽町', '京急蒲田', '住吉', '赤羽岩淵', '王子神谷', '小竹向原', '西馬込', '浅草橋', '大島', '笹塚', '西高島平', '高島平', '新板橋', '新御徒町', '東京テレポート', '高尾', '八王子', '豊田', '武蔵小金井', '立川', '青梅', '武蔵五日市', '奥多摩', '河辺', '三鷹', '町田', '稲城長沼', '西国立', '府中本町', '唐木田', '多摩センター', '高尾山口', '京王八王子', '高幡不動', '若葉台', 'つつじヶ丘', '府中', '北野', '吉祥寺', '清瀬', '保谷', '拝島', '玉川上水', '西武遊園地', '磯子', '鶴見', '東神奈川', '桜木町', '小机', '中山', '矢向', '長津田', '菊名', '横浜', '元町・中華街', '日吉', '金沢文庫', '神奈川新町', '川崎', '京急川崎', '登戸', '武蔵中原', '武蔵溝ノ口', '溝の口', '向ヶ丘遊園', '新百合ヶ丘', '鷺沼', '武蔵小杉', '元住吉', '久里浜', '逗子', '大船', '横須賀', '平塚', '小田原', '国府津', '藤沢', '二宮', '橋本', '茅ヶ崎', '本厚木', '新松田', '秦野', '伊勢原', '海老名', '相模大野', '相武台前', '片瀬江ノ島', '中央林間', '三崎口', '浦賀', '新逗子', '三浦海岸', '堀ノ内', 'かしわ台', '大和', '二俣川', '千葉', '幕張', '蘇我', '海浜幕張', '千葉中央', '津田沼', '西船橋', '君津', '上総一ノ宮', '佐倉', '成東', '成田', '成田空港', '木更津', '我孫子', '松戸', '柏', '新習志野', '上総湊', '勝浦', '南船橋', '京成臼井', '京成佐倉', '京成成田', '京成大和田', '宗吾参道', '東成田', 'ちはら台', '芝山千代田', '印旛日本医大', '印西牧の原', '妙典', '浦安', '東葉勝田台', '八千代緑が丘', '本八幡', '大宮', '南浦和', '武蔵浦和', '指扇', '浦和美園', '川越市', '本川越', '南古谷', '高麗川', '籠原', '深谷', '東所沢', '南越谷', '飯能', '小手指', '所沢', '狭山市', '小川町', '森林公園', '志木', '上福岡', '久喜', '東武動物公園', '北越谷', '北春日部', '南栗橋', '八潮', '鳩ケ谷', '和光市', '鹿島神宮', '古河', '取手', '土浦', '勝田', '水戸', '高萩', 'つくば', '守谷', '宇都宮', '小金井', '黒磯', '氏家', '高崎', '新前橋', '前橋', '館林', '熱海', '伊東', '沼津', '大月', '河口湖']
    START_LOG = 0
    df = pd.read_html(STATION_RECOMMENDATION_LINK)
    for i in range(1, len(df[-1])):
        phrase = re.split('、', df[-1][1].iloc[i])
        for j in range(len(phrase)):
            STATION_CANDIDATES.append(phrase[j])
    print(len(STATION_CANDIDATES))
    print(STATION_CANDIDATES)
    dic_candi.remove('東京')
    for each in range(len(dic_candi)):
        look_up_route(dic_candi[each], '東京', START_LOG)
        START_LOG = 1 + START_LOG
        time.sleep(1)

Fixed-size face detection with HOG features

# -*- coding: UTF-8 -*-
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from skimage import data, color, feature
from sklearn.datasets import fetch_lfw_people
import skimage.data
from itertools import chain # 用于组合样本
from skimage import data, transform
from sklearn.feature_extraction.image import PatchExtractor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
sns.set()


def sliding_window(img, patch_size, istep=2, jstep=2, scale=1):
    # Note: when the image being scanned is at a different scale from the training patches, accuracy drops
    print('patch_size:', patch_size)
    Ni, Nj = (int(scale * s) for s in patch_size)  # height and width of the scanning window
    print('sliding_window_img.shape: ', img.shape)
    # print('Ni:', Ni)
    # print('Nj:', Nj)
    for i in range(0, img.shape[0] - Ni, istep):  # step by istep along the height
        for j in range(0, img.shape[1] - Nj, jstep):  # step by jstep along the width
            patch = img[i:i + Ni, j:j + Nj]  # crop the current window
            if scale != 1:  # if scale is not 1, resize the window back to patch_size
                patch = transform.resize(patch, patch_size)
            yield (i, j), patch  # yield the window's top-left corner and the patch itself


def extract_patches(img, N, patch_size, scale=1.0):  # N is the number of patches sampled per image, patch_size is the output size
    print('img.shape:\n', img.shape)
    # print('img[np.newaxis].shape:\n', img[np.newaxis].shape)
    print('scale:\n', scale)
    print('patch_size:\n', patch_size)
    print('scale * np.array(patch_size):\n', scale * np.array(patch_size))  # patch size scaled up or down, as an array
    extracted_patch_size = tuple((scale * np.array(patch_size)).astype(int))  # cast the scaled size back to an int tuple
    print('extracted_patch_size:\n', extracted_patch_size)
    # PatchExtractor accepts a batch of images as input
    extractor = PatchExtractor(patch_size=extracted_patch_size, max_patches=N, random_state=0)  # extractor for the scaled size
    patches = extractor.transform(img[np.newaxis])  # img[np.newaxis] adds a batch axis, e.g. (872, 1000) -> (1, 872, 1000)
    # print('patches.shape:', patches.shape, 'scale:', scale)  # (N, *scaled patch shape)
    if scale != 1:
        # resize every sampled patch back to patch_size
        patches = np.array([transform.resize(patch, patch_size) for patch in patches])
    # print('patches.shape', patches.shape, 'scale:', scale)  # (N, *patch_size)
    return patches


image = color.rgb2gray(data.chelsea())  # color.rgb2gray(): RGB to greyscale; data.chelsea() is the sample cat image "Chelsea"
print('feature.hog(image, visualize=True):\n', feature.hog(image, visualize=True))  # visualize=True also returns the HOG gradient image
hog_vec, hog_vis = feature.hog(image, visualize=True)
fig, ax = plt.subplots(1, 2, figsize=(12, 6), subplot_kw=dict(xticks=[], yticks=[]))
ax[0].imshow(image, cmap='gray')
print('image.shape: ', image.shape)
ax[0].set_title('input image')
ax[1].imshow(hog_vis)
print('hog_vis.shape: ', hog_vis.shape)
ax[1].set_title('visualization of HOG features')

faces = fetch_lfw_people()
positive_patches = faces.images  # face images serve as the positive training samples
print('positive_patches.shape:', positive_patches.shape)
imgs_to_use = ['camera', 'text', 'coins', 'moon', 'page', 'clock', 'immunohistochemistry', 'chelsea',
               'coffee', 'hubble_deep_field']
# convert all of these sample images to greyscale
# print('getattr(data, name)', getattr(data, imgs_to_use[0]))
images = [color.rgb2gray(getattr(data, name)()) for name in imgs_to_use]  # getattr(data, name) returns the attribute `name` of the data module, which is a callable
# sample patches from every negative image at several scales and resize them to the positive-sample size
# np.vstack() stacks the per-image patch batches into a single array
negative_patches = np.vstack([extract_patches(im, 1000, positive_patches[0].shape, scale) for im in images for scale in [0.5, 1.0, 2.0]])
print('negative_patches.shape:', negative_patches.shape)

fig2, ax2 = plt.subplots(6, 10)
for i, axi in enumerate(ax2.flat):
    axi.imshow(negative_patches[500 * i], cmap='gray')
    axi.axis('off')

X_train = np.array([feature.hog(im) for im in chain(positive_patches, negative_patches)])  # combine positive and negative samples and compute HOG features for each
y_train = np.zeros(X_train.shape[0])  # label vector of zeros, one entry per training sample
# print('X_train.shape:\n', X_train.shape)
# print('positive_patches.shape:\n', positive_patches.shape)
y_train[:positive_patches.shape[0]] = 1  # the first positive_patches.shape[0] samples are the positives; label them 1
# print('X_train.shape: ', X_train.shape)
# cv=3 means 3-fold cross-validation
print('cross_val_score(GaussianNB(), X_train, y_train):', cross_val_score(GaussianNB(), X_train, y_train, cv=3))
grid = GridSearchCV(LinearSVC(), {'C': [0.0625, 0.125, 0.25, 0.5]}, cv=3)
grid.fit(X_train, y_train)
print('grid.best_score_: ', grid.best_score_)
print('grid.best_params_: ', grid.best_params_)
# take the best estimator found by the grid search and refit it on the full training set
model = grid.best_estimator_
model.fit(X_train, y_train)

fig3, ax3 = plt.subplots()
test_image = skimage.data.astronaut()  # use the astronaut image as a new, unseen test input
test_image = skimage.color.rgb2gray(test_image)
test_image = skimage.transform.rescale(test_image, 0.5)  # downscale by a factor of 0.5
test_image = test_image[:160, 40:180]  # crop to a region comparable to the training scale
plt.imshow(test_image, cmap='gray')
plt.axis('off')
print('test_image.shape:', test_image.shape)
indices, patches = zip(*sliding_window(test_image, positive_patches[0].shape))  # indices: top-left corner of each window; patches: the cropped windows
patches_hog = np.array([feature.hog(patch) for patch in patches])  # HOG features of every window produced by the generator
print('patches_hog.shape:', patches_hog.shape)

labels = model.predict(patches_hog)  # classify every window by its HOG features
print('The total number of the pictures with human faces is: ', labels.sum())  # number of windows classified as containing a face

fig4, ax4 = plt.subplots()
ax4.imshow(test_image, cmap='gray')
ax4.axis('off')

Ni, Nj = (positive_patches[0]).shape  # height, width
indices = np.array(indices)

for i, j in indices[labels == 1]:
    ax4.add_patch(plt.Rectangle((j, i), Nj, Ni, edgecolor='red', alpha=0.3, lw=2, facecolor='none'))
plt.show()
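
The sliding_window generator already accepts a scale argument, which the script above never uses. A sketch of my own (not part of the original) that scans the same test image at a few scales, to soften the fixed-size limitation noted in the comment inside sliding_window:

for s in (0.8, 1.0, 1.2):
    # windows are cropped at the scaled size and resized back to the training patch size inside sliding_window
    idx_s, patches_s = zip(*sliding_window(test_image, positive_patches[0].shape, scale=s))
    hog_s = np.array([feature.hog(p) for p in patches_s])
    print('scale', s, '->', int(model.predict(hog_s).sum()), 'windows flagged as faces')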

Bayesian generative classification with KDE

# -*- coding: UTF-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity
from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score


class KDEClassifier(BaseEstimator, ClassifierMixin):  # Bayesian generative classification based on KDE
    # bandwidth is the kernel bandwidth used within each class (float); kernel is the kernel name
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel

    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))  # np.unique(): drop duplicate labels and sort them in ascending order
        print('X:\n', X)
        print('y:\n', y)
        print('self.classes_:\n', self.classes_)
        # split the training data by class label
        training_sets = [X[y == yi] for yi in self.classes_]
        # fit one KernelDensity model per class
        self.models_ = [KernelDensity(bandwidth=self.bandwidth, kernel=self.kernel).fit(Xi) for Xi in training_sets]
        # class prior probabilities: the fraction of training samples in each class
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0]) for Xi in training_sets]
        return self

    # posterior probabilities for new data
    def predict_proba(self, X):
        # per-class log densities, shape [n_samples, n_classes]; entry [i, j] is the log-likelihood of sample i under class j
        logprobs = np.array([model.score_samples(X) for model in self.models_]).T
        # log density of X under the model trained for the first class
        print('self.models_[0].score_samples(X).shape:\n', self.models_[0].score_samples(X).shape)
        print('logprobs.shape:\n', logprobs.shape)  # e.g. (n_samples, 10)
        print('logprobs:\n', logprobs)
        # add the log priors, then exponentiate back to probabilities
        result = np.exp(logprobs + self.logpriors_)
        # normalise so that each row sums to 1
        return result / result.sum(1, keepdims=True)

    def predict(self, X):
        # return the class with the highest posterior probability
        prediction = self.classes_[np.argmax(self.predict_proba(X), 1)]  # np.argmax(..., 1): index of the maximum along each row
        print('prediction:\n', prediction)
        return prediction


digits = load_digits()
print('digits.data.shape:\n', digits.data.shape)
bandwidths = 10 ** np.linspace(0, 2, 100)
grid = GridSearchCV(KDEClassifier(), {'bandwidth': bandwidths}, cv=5, iid=False)
grid.fit(digits.data, digits.target)

# print("grid.cv_results_['mean_test_score']:\n", grid.cv_results_['mean_test_score'])
scores = [val for val in grid.cv_results_['mean_test_score']]
plt.semilogx(bandwidths, scores)  # logarithmic x axis
plt.xlabel('bandwidth')
plt.ylabel('accuracy')
plt.title('KDE Model Performance')
print(grid.best_params_)
print('\naccuracy =', grid.best_score_)
print('cross_val_score(GaussianNB(), digits.data, digits.target).mean(): ',
      cross_val_score(GaussianNB(), digits.data, digits.target, cv=5).mean())
plt.show()
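
A brief usage sketch of my own (not in the original): GridSearchCV refits the best estimator on the full data by default, so it can be used to predict a few samples directly.

best_model = grid.best_estimator_  # already refit with the best bandwidth on all of digits.data
print(best_model.predict(digits.data[:10]))  # predicted digits
print(digits.target[:10])                    # true digits, for comparison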