通过递归划分特征空间,使子节点“纯度”最高
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier,plot_tree
import matplotlib.pyplot as plt

# Synthetic binary-classification dataset: 200 samples, 5 features.
features, labels = make_classification(n_samples=200, n_features=5, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(features, labels)

# Decision tree: recursively partitions the feature space so that
# child nodes are as "pure" as possible.
tree = DecisionTreeClassifier(
    criterion='gini',      # impurity measure; 'entropy' is the alternative
    max_depth=3,           # cap tree depth to limit overfitting
    min_samples_split=10,  # a node needs >= 10 samples before it may split
    random_state=100,
)
tree.fit(X_train, y_train)

# Accuracy on the held-out test set.
y_pred = tree.predict(X_test)
print((y_pred == y_test).mean())

# Visualize the fitted tree.
plt.figure(figsize=(12, 8))
plot_tree(
    tree,
    feature_names=['X1', 'X2', 'X3', 'X4', 'X5'],
    class_names=['0', '1'],
    filled=True,
)
plt.show()
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Fresh synthetic dataset for the random-forest demo.
X, y = make_classification(n_samples=200, n_features=5, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X, y)

rf = RandomForestClassifier(
    n_estimators=100,     # number of trees in the ensemble
    max_depth=5,
    min_samples_split=5,
    oob_score=True,       # evaluate with out-of-bag samples
    random_state=100,
    n_jobs=-1,            # use all CPU cores
)
rf.fit(X_train, y_train)

# BUG FIX: predict with the fitted forest. The original called
# `tree.predict(X_test)`, reusing the decision tree from the previous
# section, so the printed accuracy belonged to the wrong model.
y_pred = rf.predict(X_test)
print((y_pred == y_test).mean())
print(rf.score(X_test, y_test))
print('Feature Importances:', rf.feature_importances_)
| 模型 | 关键超参数 | 调优建议 |
|---|---|---|
| DecisionTree | max_depth, min_samples_split, min_samples_leaf | 从 max_depth=3~10 开始 |
| RandomForest | n_estimators, max_depth, max_features | n_estimators 越大越好(到性能瓶颈);max_features='sqrt'(分类);回归任务 sklearn 默认使用全部特征(max_features=1.0) |
| SVM | C, gamma, kernel | 先选核(RBF 默认),再网格搜索 C 和 gamma |
| LogisticRegression | C, penalty | 用 LogisticRegressionCV 自动选 C |
使用GridSearchCV自动调参(以Random Forest为例)
#首先加载和设置好随机森林的算法的内容
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Build the dataset and a base random forest to be tuned.
X, y = make_classification(n_samples=200, n_features=5, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X, y)
rf = RandomForestClassifier(
    random_state=100,
    n_jobs=8,  # use 8 CPU cores
)

# Exhaustive hyper-parameter search with cross-validation.
from sklearn.model_selection import GridSearchCV

search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9, None],
    'max_features': ['sqrt', 'log2'],
}
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=search_space,
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=8,            # use 8 CPU cores
    verbose=0,           # log level: 0 silent, 1 brief, 2 detailed
)
grid_search.fit(X_train, y_train)

# Best estimator is already refit on the full training set with the
# best parameter combination.
best_model = grid_search.best_estimator_
print('Best params:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Evaluate the tuned model on the held-out test set.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)  # class probabilities
print('准确率:', (y_test == y_pred).mean())

# Persist the model to disk for later reuse.
import joblib
joblib.dump(best_model, 'best_random_forest_classifier.pkl')

# Reload and predict to confirm the round trip works.
loaded_model = joblib.load('best_random_forest_classifier.pkl')
y_pred_loaded = loaded_model.predict(X_test)
KNN的算法核心就是找到最近的几个邻居,投票(分类)或平均(回归)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# KNN is distance-based, so features must be standardized first;
# fit the scaler on the training split only to avoid leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Core idea of KNN: find the nearest neighbours, then vote
# (classification) or average (regression).
knn = KNeighborsClassifier(
    n_neighbors=5,       # number of neighbours to consult
    weights='uniform',   # or 'distance'
    metric='minkowski',  # distance metric
    p=2,                 # p=2 Euclidean distance, p=1 Manhattan distance
)
knn.fit(X_train_scaled, y_train)

# Two equivalent ways to report test accuracy.
y_pred = knn.predict(X_test_scaled)
print('KNN Accuracy:', knn.score(X_test_scaled, y_test))
print((y_pred == y_test).mean())
| 参数 | 选项 | 说明 |
|---|---|---|
| metric | 'euclidean', 'manhattan', 'chebyshev', 'minkowski' | 控制距离计算方式 |
| weights | 'uniform'(等权)或 'distance'(权重 = 1 / 距离) | 取 'distance' 时距离近的样本影响更大 |
KNN的致命弱点: