import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
# Render matplotlib figures inline in the Jupyter notebook (IPython magic)
%matplotlib inline
# Load the fruit dataset (parsed as a tab-delimited text file) and preview
# the first five rows; the trailing expression is what the notebook displays.
fruits_df = pd.read_csv('fruit_data_with_colors.txt', sep='\t')
fruits_df.head(5)
| | fruit_label | fruit_name | fruit_subtype | mass | width | height | color_score |
---|---|---|---|---|---|---|---|
0 | 1 | apple | granny_smith | 192 | 8.4 | 7.3 | 0.55 |
1 | 1 | apple | granny_smith | 180 | 8.0 | 6.8 | 0.59 |
2 | 1 | apple | granny_smith | 176 | 7.4 | 7.2 | 0.60 |
3 | 2 | mandarin | mandarin | 86 | 6.2 | 4.7 | 0.80 |
4 | 2 | mandarin | mandarin | 84 | 6.0 | 4.6 | 0.79 |
# Report how many samples (rows) the dataset contains
print('样本个数:', fruits_df.shape[0])
样本个数: 59
# Build a lookup from each numeric fruit label to its fruit name
label_name_pairs = zip(fruits_df['fruit_label'], fruits_df['fruit_name'])
fruit_name_dict = {label: name for label, name in label_name_pairs}
print(fruit_name_dict)
{1: 'apple', 2: 'mandarin', 3: 'orange', 4: 'lemon'}
# Separate features from the target, then hold out a quarter of the samples
# as the test set (fixed seed so the split is repeatable).
feature_columns = ['mass', 'width', 'height', 'color_score']
X = fruits_df[feature_columns]
y = fruits_df['fruit_label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
split_sizes = [len(part) for part in (X, X_train, X_test)]
print('数据集样本数:{},训练集样本数:{},测试集样本数:{}'.format(*split_sizes))
数据集样本数:59,训练集样本数:44,测试集样本数:15
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then apply the learned
# ranges to the test split so the test data does not influence the scaling.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Print each training feature's max/min before and after scaling. The number
# of features is read from the data instead of being hard-coded.
for i in range(X_train.shape[1]):
    print('归一化前,训练数据第{}维特征最大值:{:.3f},最小值:{:.3f}'.format(
        i + 1,
        X_train.iloc[:, i].max(),
        X_train.iloc[:, i].min()))
    print('归一化后,训练数据第{}维特征最大值:{:.3f},最小值:{:.3f}'.format(
        i + 1,
        X_train_scaled[:, i].max(),
        X_train_scaled[:, i].min()))
    # Blank line between features for readability
    print()
归一化前,训练数据第1维特征最大值:356.000,最小值:76.000
归一化后,训练数据第1维特征最大值:1.000,最小值:0.000
归一化前,训练数据第2维特征最大值:9.200,最小值:5.800
归一化后,训练数据第2维特征最大值:1.000,最小值:0.000
归一化前,训练数据第3维特征最大值:10.500,最小值:4.000
归一化后,训练数据第3维特征最大值:1.000,最小值:0.000
归一化前,训练数据第4维特征最大值:0.920,最小值:0.550
归一化后,训练数据第4维特征最大值:1.000,最小值:0.000
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian naive Bayes classifier on the scaled training features
# and print its score on the scaled test features.
gnb = GaussianNB().fit(X_train_scaled, y_train)
gnb_accuracy = gnb.score(X_test_scaled, y_test)
print('准确率:{:.3f}'.format(gnb_accuracy))
准确率:0.667
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Tune the number of trees of a random forest with a 3-fold cross-validated
# grid search scored by accuracy, using the unscaled training features.
parameters = {
    'n_estimators': [10, 50, 100, 150, 200],
}
forest = RandomForestClassifier(random_state=0)
clf = GridSearchCV(forest, parameters, cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

# Report the selected setting, its cross-validation score, and test accuracy
print('最优参数:', clf.best_params_)
print('验证集最高得分:', clf.best_score_)
test_accuracy = clf.score(X_test, y_test)
print('测试集准确率:{:.3f}'.format(test_accuracy))
最优参数: {'n_estimators': 150}
验证集最高得分: 0.818181818182
测试集准确率:0.867
from sklearn.ensemble import GradientBoostingClassifier

# Tune the learning rate of a gradient boosting classifier with a 3-fold
# cross-validated grid search scored by accuracy.
# NOTE(review): the grid spans 0.001 to 100; the largest values are unusually
# high for boosting — confirm they are intended.
parameters = {'learning_rate': [0.001, 0.01, 0.1, 1, 10, 100]}
# Seed the estimator so repeated runs select the same parameters and report
# the same scores, matching the seeded random forest search.
clf = GridSearchCV(GradientBoostingClassifier(random_state=0), parameters,
                   cv=3, scoring='accuracy')
clf.fit(X_train, y_train)

# Report the selected setting, its cross-validation score, and test accuracy
print('最优参数:', clf.best_params_)
print('验证集最高得分:', clf.best_score_)
print('测试集准确率:{:.3f}'.format(clf.score(X_test, y_test)))
最优参数: {'learning_rate': 0.001}
验证集最高得分: 0.795454545455
测试集准确率:0.733