import pandas as pd
import numpy as np
读入数据
# Load the dataset from a CSV file; the path is relative to the current working directory.
data = pd.read_csv("./datasets/50_Startups.csv")
# Preview the first rows of the frame (head() shows 5 rows by default).
data.head()
|   | R&D Spend | Administration | Marketing Spend | State | Profit |
|---|-----------|----------------|-----------------|-------|--------|
| 0 | 165349.20 | 136897.80 | 471784.10 | New York | 192261.83 |
| 1 | 162597.70 | 151377.59 | 443898.53 | California | 191792.06 |
| 2 | 153441.51 | 101145.55 | 407934.54 | Florida | 191050.39 |
| 3 | 144372.41 | 118671.85 | 383199.62 | New York | 182901.99 |
| 4 | 142107.34 | 91391.77 | 366168.42 | Florida | 166187.94 |
分开xy
# Feature matrix: every column except the last one, as a NumPy array.
X = data.iloc[:, :-1].to_numpy()
# Target vector: the last column only (Profit in the preview above).
Y = data.iloc[:, -1].to_numpy()
编码
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
# Encode the categorical column at index 3 (State in the preview above).
# LabelEncoder first maps each distinct label to an integer code, in place.
labelEncoder = LabelEncoder()
X[:,3] = labelEncoder.fit_transform(X[:,3])
# One-hot encode ONLY the categorical column. Fitting OneHotEncoder on the whole
# matrix would treat every distinct numeric value in the other columns as its
# own category, producing one dummy column per value and discarding the numeric
# information the regression needs. (Older scikit-learn releases restricted the
# encoder with categorical_features=[3]; that argument has since been removed.)
onehotencoder = OneHotEncoder()
state_dummies = onehotencoder.fit_transform(X[:, [3]].astype(int)).toarray()
# Final layout: dummy columns first, then the remaining columns cast to float.
# NOTE(review): all dummy columns are kept; drop one if perfect collinearity
# with the intercept (the "dummy variable trap") is a concern.
X = np.hstack([state_dummies, np.delete(X, 3, axis=1).astype(float)])
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)
from sklearn.linear_model import LinearRegression
# Create a linear regression model with default settings.
regressor = LinearRegression()
# Fit the model on the training split only; the test split stays unseen.
regressor.fit(X_train,Y_train)
LinearRegression()