数据预处理
目录
导入必要的包
import pandas as pd
import numpy as np
读入数据
# Load the raw dataset from disk into a DataFrame.
data = pd.read_csv("./datasets/Data.csv")
# Preview the first rows (notebook display expression).
data.head()
 | Country | Age | Salary | Purchased |
---|---|---|---|---|
0 | France | 44.0 | 72000.0 | No |
1 | Spain | 27.0 | 48000.0 | Yes |
2 | Germany | 30.0 | 54000.0 | No |
3 | Spain | 38.0 | 61000.0 | No |
4 | Germany | 40.0 | NaN | Yes |
确定X,Y变量
# Features: every column except the last one.
X = data.iloc[:, :-1].values
# Target: the last column (index 3 in this dataset); using -1 keeps it
# consistent with the feature slice above.
Y = data.iloc[:, -1].values
数据空值处理
from sklearn.impute import SimpleImputer
# SimpleImputer with default settings; the output shown below has the
# missing entries replaced by their column means.
imputer = SimpleImputer()
# Only columns 1 and 2 are numeric; column 0 holds country names.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
# Display the imputed feature matrix (notebook display expression).
X
array([['France', 44.0, 72000.0],
['Spain', 27.0, 48000.0],
['Germany', 30.0, 54000.0],
['Spain', 38.0, 61000.0],
['Germany', 40.0, 63777.77777777778],
['France', 35.0, 58000.0],
['Spain', 38.77777777777778, 52000.0],
['France', 48.0, 79000.0],
['Germany', 50.0, 83000.0],
['France', 37.0, 67000.0]], dtype=object)
编码处理
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Map the country names in column 0 to integer codes.
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])

# One-hot encode ONLY the categorical country column. Passing the whole
# matrix to OneHotEncoder would also treat every distinct numeric value
# in the remaining columns as its own category.
onehotencoder = OneHotEncoder()
country_onehot = onehotencoder.fit_transform(X[:, [0]]).toarray()
# Re-attach the numeric columns after the one-hot country columns.
X = np.hstack([country_onehot, X[:, 1:].astype(float)])

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
分割数据
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
标准化
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Learn the scaling statistics from the training set only.
X_train = sc_X.fit_transform(X_train)
# Apply the training-set statistics to the test set. Re-fitting on the
# test data would scale it differently from the training data and leak
# test-set information into preprocessing.
X_test = sc_X.transform(X_test)