使用Python在简单线性回归中检测并移除异常值

时间:2018-10-02 13:48:51

标签: python scikit-learn linear-regression cross-validation training-data

这是我为简单线性回归创建的代码。代码如下,我有几个问题希望得到解答:如何检测并从X和Y中删除异常值?最好能附上代码示例。模型的训练和评估部分写得是否合理?交叉验证和训练/测试集的划分是否正确?如何解读RMSE值——值越大越好吗?

     import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy import stats

# Load 'Sheet1' of the hard-coded Excel workbook into a DataFrame.
# NOTE(review): an absolute, user-specific Windows path ties this script to
# one machine — consider taking the path as a command-line argument.
data = pd.read_excel  ("C:\\Users\\AchourAh\\Desktop\\Simple_Linear_Regression\\SP Level Simple Linear Regression\\PL32_PMM_03_09_2018_SP_Level.xlsx",'Sheet1') #Import Excel file

# Replace null (NaN) values of the whole dataset with 0 so the regression
# never sees missing values.
data1 = data.fillna(0)
print(data1)  # quick visual sanity check of the loaded data

 # Extraction of the independent and dependent variable
 X = data1.iloc[0:len(data1),1].values.reshape(-1, 1) #Extract the column of the COPCOR SP we are going to check its impact
 Y = data1.iloc[0:len(data1),2].values.reshape(-1, 1) #Extract the column of the PAUS SP


 # Data Splitting to train and test set
 from sklearn.model_selection import train_test_split
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size =0.25,random_state=42)


 # Training the model and Evaluation of the Model
 from sklearn.linear_model import LinearRegression
 from sklearn import model_selection
 lm = LinearRegression() #create an lm object of LinearRegression Class
 lm.fit(X_train, Y_train)  #train our LinearRegression model using the training set of data - dependent and independent variables as parameters. Teaching lm  that Y_train values are all corresponding to X_train values.
 from sklearn.model_selection import KFold
 kf = KFold(n_splits=6, random_state=None)
 for train_index, test_index in kf.split(X_train   
 print("Train:", train_index, "Validation:",test_index)
 X_train1, X_test1 = X[train_index], X[test_index]
 Y_train1, Y_test1 = Y[train_index], Y[test_index]
results = -1 * model_selection.cross_val_score(lm, X_train1,       Y_train1,scoring='neg_mean_squared_error', cv=kf)
print(results)
print(results.mean())
y_pred = lm.predict(X_test)
from sklearn.metrics import mean_squared_error
 mse_test = mean_squared_error(Y_test, y_pred)
 print(mse_test)
 import math
 print(math.sqrt(mse_test))
 print(math.sqrt(results.mean()))
 df = pd.DataFrame({'Actual': [Y_test], 'Predicted': [y_pred]})
 print(df)



 # Graph of the Training model
 plt.scatter(X_train, Y_train, color = 'red')#plots scatter graph of COP COR against PAUS for values in X_train and y_train
 plt.plot(X_train, lm.predict(X_train), color = 'blue')#plots the graph of   predicted PAUS against COP COR.
 plt.title('SP000905974')
 plt.xlabel('COP COR Quantity')
 plt.ylabel('PAUS Quantity')
 plt.show()#Show the graph

 # Statistical Analysis of the training set with Statsmodels
 X2 = sm.add_constant(X_train) # add a constant to the model
 est = sm.OLS(Y_train, X2).fit()
 print(est.summary()) # print the results

 # Statistical Analysis of the training set with Scikit-Learn
 params = np.append(lm.intercept_,lm.coef_)
 predictions = lm.predict(X_train)
 newX = pd.DataFrame({"Constant":np.ones(len(X_train))}).join(pd.DataFrame (X_train))
 MSE = (sum((Y_train-predictions)**2))/(len(newX)-len(newX.columns))
 var_b = MSE*(np.linalg.inv(np.dot(newX.T,newX)).diagonal())
 sd_b = np.sqrt(var_b)
 ts_b = params/ sd_b
 p_values =[2*(1-stats.t.cdf(np.abs(i),(len(newX)-1))) for i in ts_b]
 sd_b = np.round(sd_b,3)
 ts_b = np.round(ts_b,3)
 p_values = np.round(p_values,5)
 params = np.round(params,4)
 myDF1 = pd.DataFrame()
 myDF1["Coefficients"],myDF1["Standard Errors"],myDF1["t values"],myDF1["P-values"] = [params,sd_b,ts_b,p_values]
 print(myDF1)

我是初学者,如果代码中还有其他问题,也欢迎提出任何评论和建议。

0 个答案:

没有答案