导入必要的包
import pandas as pd
import numpy as np
读入数据
# Load the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer="./datasets/Data.csv")
# Preview the first five rows (five is also pandas' default for head()).
data.head(n=5)
|
Country
|
Age
|
Salary
|
Purchased
|
0
|
France
|
44.0
|
72000.0
|
No
|
1
|
Spain
|
27.0
|
48000.0
|
Yes
|
2
|
Germany
|
30.0
|
54000.0
|
No
|
3
|
Spain
|
38.0
|
61000.0
|
No
|
4
|
Germany
|
40.0
|
NaN
|
Yes
|
确定X,Y变量
# Feature matrix: every column except the last one
# (Country, Age, Salary in the preview above).
X = data.iloc[:, :-1].values
# Target vector: the last column (Purchased). It is selected relative to the
# end, like the slice used for X, so the two cannot drift apart if columns
# are added to or removed from the CSV.
Y = data.iloc[:, -1].values
数据空值处理
from sklearn.impute import SimpleImputer

# Fill the missing entries (NaN) of the numeric columns 1-2 (Age, Salary) with
# the mean of the respective column. The arguments below simply spell out
# SimpleImputer's defaults.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# Learn the column means first, then write the imputed values back in place.
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
X
array([['France', 44.0, 72000.0],
['Spain', 27.0, 48000.0],
['Germany', 30.0, 54000.0],
['Spain', 38.0, 61000.0],
['Germany', 40.0, 63777.77777777778],
['France', 35.0, 58000.0],
['Spain', 38.77777777777778, 52000.0],
['France', 48.0, 79000.0],
['Germany', 50.0, 83000.0],
['France', 37.0, 67000.0]], dtype=object)
编码处理
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

# Turn the country names in column 0 into integer codes.
labelencoder = LabelEncoder()
X[:,0] = labelencoder.fit_transform(X[:,0])

# One-hot encode ONLY the country column. A bare OneHotEncoder().fit_transform(X)
# treats every column as categorical, so Age and Salary would be expanded into
# dummy columns as well. ColumnTransformer limits the encoder to column 0 and
# passes the remaining (numeric) columns through unchanged.
onehotencoder = ColumnTransformer(
    [("country", OneHotEncoder(), [0])],
    remainder="passthrough",  # Age and Salary follow the dummy columns as-is
    sparse_threshold=0,       # always return a dense array (no .toarray() needed)
)
# X is an object array at this point; cast so the result is a plain float matrix.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)

# Encode the target labels as integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
分割数据
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state makes
# the shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
标准化
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# Learn the per-feature mean and standard deviation from the training data
# only, and scale the training data with them.
X_train = sc_X.fit_transform(X_train)
# Reuse the training statistics for the test data. Refitting here
# (fit_transform) would scale train and test differently and leak test-set
# information into the preprocessing step.
X_test = sc_X.transform(X_test)