线性模型假设目标值是特征值的线性组合。
适用于连续值的预测。
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
import numpy as np
import pandas as pd

# Build a synthetic regression problem: random features X, and a target y that
# is a linear combination of the first two features, an interaction term
# (X[:,2] * X[:,1]), a bias of 1, and small Gaussian noise.
X = np.random.rand(200, 3)  # 200 samples, 3 features
y = 3 * X[:, 0] + 2 * X[:, 1] + 1 + np.random.randn(200) * 0.1 + X[:, 2] * X[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# Fit an ordinary least-squares linear model.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred = lr.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))  # was imported but never used
print("R2:", r2_score(y_test, y_pred))  # coefficient of determination
print("Coefficients:", lr.coef_)  # learned weight per feature
print("Intercept:", lr.intercept_)  # learned bias term
岭回归(Ridge):在线性回归基础上加入L2正则化(惩罚系数平方和),防止过拟合。
# Ridge regression: least squares with an L2 penalty on the coefficients.
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0)  # larger alpha -> stronger regularization
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)
print(r2_score(y_test, y_pred))
Lasso回归:在线性回归基础上加入L1正则化(惩罚系数绝对值之和),具有特征选择能力(部分系数会变为0)。
# Lasso regression: least squares with an L1 penalty; can drive some
# coefficients exactly to zero (implicit feature selection).
from sklearn.linear_model import Lasso

lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)
print(r2_score(y_test, y_pred), '\n', lasso_model.coef_, '\n', lasso_model.intercept_)
逻辑回归(Logistic Regression):用于二分类或多分类,输出概率值。
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate 100 random samples with 4 features and 2 classes.
# (The original comment claimed 1000 samples; n_samples is actually 100.)
X, y = make_classification(n_samples=100, n_features=4, n_classes=2, random_state=100)
# Hold out 20% as a test set; pin random_state for reproducibility,
# consistent with the other splits in this file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# C is the inverse of the regularization strength; lbfgs is the default solver.
logistic_regression = LogisticRegression(C=1.0, solver='lbfgs')
logistic_regression.fit(X_train, y_train)

y_pred = logistic_regression.predict(X_test)  # hard class labels
y_proba = logistic_regression.predict_proba(X_test)  # one probability per class
print(logistic_regression.score(X_test, y_test))  # accuracy on the test set
print(y_proba)
SVM通过寻找最大间隔超平面进行分类/回归,对高维数据表现优异。
分类任务
# Support Vector Classification (SVC).
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Pin random_state so the example is reproducible, consistent with the
# other examples in this file (the original left both calls unseeded).
X, y = make_classification(n_samples=200, n_features=6, n_classes=2, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

# SVMs are sensitive to feature scale, so standardize first.
# Fit the scaler on the training set only to avoid test-set leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svc = SVC(kernel='rbf', C=1.0, gamma='scale')
svc.fit(X_train_scaled, y_train)
y_pred = svc.predict(X_test_scaled)
print('准确率:', (y_test == y_pred).mean())  # fraction of correct predictions
回归任务
# Support Vector Regression (SVR).
# NOTE(review): the original reused X_train/y_train from the SVC example
# above, i.e. it regressed on 0/1 class labels, which makes the printed R^2
# meaningless. Build a proper continuous target instead (np, train_test_split
# and r2_score are already imported earlier in this file).
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

X_reg = np.random.rand(200, 3)
y_reg = 3 * X_reg[:, 0] + 2 * X_reg[:, 1] + 1 + np.random.randn(200) * 0.1
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=100)

# SVMs are sensitive to feature scale, so standardize first.
scaler = StandardScaler()
Xr_train_scaled = scaler.fit_transform(Xr_train)
Xr_test_scaled = scaler.transform(Xr_test)

svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # epsilon: half-width of the no-penalty tube
svr.fit(Xr_train_scaled, yr_train)
y_pred = svr.predict(Xr_test_scaled)
print(r2_score(yr_test, y_pred))
| 核函数 | 代码 | 适用场景 |
|---|---|---|
| 线性核 | kernel='linear' | 线性可分,高维稀疏数据(如文本) |
| 多项式核 | kernel='poly', degree=3 | 特征有交互关系(degree=1时退化为线性核) |
| RBF核 | kernel='rbf' | 默认推荐:适用于非线性问题 |
RBF核和poly核对gamma敏感,可取gamma='scale'(默认)或gamma='auto'。