如何从现有的输入数据中进行多元线性回归

import datetime
import time
# NOTE: this rebinds the name `datetime` to the class, so it must stay after
# `import datetime`; DTFormatOpt relies on `datetime.strptime`.
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
# from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import SVR
# from pandas import ExcelWriter


def DTFormatOpt(s, flist):
    """Parse ``s`` with the first ``strptime`` format in ``flist`` that fits.

    Args:
        s: The date string to parse.
        flist: Iterable of ``strptime`` format strings, tried in order.

    Returns:
        The parsed ``datetime``, or ``None`` when no format matches.
    """
    for f in flist:
        try:
            return datetime.strptime(s, f)
        except ValueError:
            continue
    return None


def Input_PreProcessor(IntRate_whole_data):
    """Clean and encode the raw data frame, then split it into train/test.

    The steps, in order: drop unused columns, strip currency/percent symbols,
    rank-encode ``X9``, impute missing values, one-hot encode the categorical
    columns, derive ``X5``/``X15``/``X23``/``X11`` features, drop ``X10`` and
    finally select the rows labelled ``'x'`` and ``'y'``.

    Args:
        IntRate_whole_data: ``pandas.DataFrame`` holding the ``X1`` ... ``X32``
            columns referenced below. Its index must contain the labels
            ``'x'`` and ``'y'`` (presumably the outer level of a MultiIndex
            built with ``pd.concat(..., keys=['x', 'y'])`` - confirm against
            the caller).

    Returns:
        A tuple ``(IntRate_Train_Final, IntRate_Test_Final)`` with the rows
        selected by ``.loc['x']`` and ``.loc['y']`` respectively.
    """
    # Drop the columns that are not used as features.
    R_Drop = IntRate_whole_data.drop(
        ['X2', 'X3', 'X8', 'X16', 'X18', 'X19', 'X20'], axis=1)

    # Convert the columns carrying dollar / percent signs to floats.
    # Raw strings keep the regex patterns identical while avoiding
    # invalid-escape warnings.
    R_Drop[['X4', 'X5', 'X6']] = (
        R_Drop[['X4', 'X5', 'X6']].replace(r'[\$,]', '', regex=True).astype(float))
    R_Drop[['X1', 'X30']] = (
        R_Drop[['X1', 'X30']].replace(r'[,\%]', '', regex=True).astype(float))

    # X9 becomes a dense rank so the ordinal relationship among its values
    # is kept.
    R_Drop['X9'] = R_Drop['X9'].rank(method='dense')

    # Missing values: 0 for X25/X26, the median for the numeric columns from
    # X4 onwards, and the most frequent value for the categorical columns.
    # Plain assignment is used instead of chained ``inplace=True``, which
    # does not reliably write back to the parent frame.
    R_Drop['X26'] = R_Drop['X26'].fillna(0)
    R_Drop['X25'] = R_Drop['X25'].fillna(0)
    # numeric_only=True: the frame still holds text columns at this point.
    R_Drop = R_Drop.fillna(R_Drop.median(numeric_only=True)['X4':])
    Ctg_Ind_Miss = ['X7', 'X11', 'X12', 'X14', 'X15', 'X17', 'X23', 'X32']
    R_Drop[Ctg_Ind_Miss] = R_Drop[Ctg_Ind_Miss].apply(
        lambda x: x.fillna(x.value_counts().index[0]))

    # Categorical features are replaced by binary dummy variables.
    Ctg_Ind = ['X7', 'X12', 'X14', 'X17', 'X32']
    IntRate_Dummies = pd.get_dummies(R_Drop[Ctg_Ind], drop_first=True)
    IntRate_NoMiss = R_Drop.join(IntRate_Dummies)
    IntRate_NoMiss = IntRate_NoMiss.drop(Ctg_Ind, axis=1)

    # X5 is replaced by the difference X4 - X5.
    IntRate_NoMiss['X5'] = IntRate_NoMiss['X4'] - IntRate_NoMiss['X5']

    # X15 is reduced to its calendar quarter and then binarized.
    IntRate_NoMiss['X15'] = pd.to_datetime(
        IntRate_NoMiss['X15'], errors='coerce', format='%b-%d')
    IntRate_NoMiss['X15'] = IntRate_NoMiss['X15'].dt.quarter
    IssueDate_Dummies = pd.get_dummies(IntRate_NoMiss['X15'], drop_first=True)
    IntRate_NoMiss = IntRate_NoMiss.join(IssueDate_Dummies)
    IntRate_NoMiss = IntRate_NoMiss.drop(['X15'], axis=1)

    # X23 becomes the number of days between each date and the most recent
    # date in the column.
    Flist = ['%b-%y', '%d-%b']
    IntRate_NoMiss['X23'] = IntRate_NoMiss['X23'].apply(
        lambda x: DTFormatOpt(str(x), Flist))
    # '%d-%b' carries no year, so strptime yields 1900; those are mapped to 2001.
    IntRate_NoMiss['X23'] = IntRate_NoMiss['X23'].map(
        lambda dt: dt.replace(year=2001) if dt.year == 1900 else dt)
    # Two-digit years that were expanded past 2020 are moved back one century.
    IntRate_NoMiss['X23'] = IntRate_NoMiss['X23'].map(
        lambda dt: dt.replace(year=dt.year - 100) if dt.year > 2020 else dt)
    Most_Recent_Date = IntRate_NoMiss['X23'].max()
    Days_CreditLine = Most_Recent_Date - IntRate_NoMiss['X23']
    IntRate_NoMiss['X23'] = Days_CreditLine.dt.days.astype(float)

    # X11 is converted to a number:
    #   "< 1 year" with X10 missing  -> 0
    #   "< 1 year" with X10 present  -> 1
    #   "10+ years"                  -> 15
    # The previous pattern ``'[,years]' or '[,year]'`` always evaluated to the
    # first operand, a character class stripping single letters; the intent
    # is to remove the word "year(s)".
    IntRate_NoMiss['X11'] = (
        IntRate_NoMiss['X11']
        .replace(r'\s*years?', '', regex=True)
        .replace(r'10\+', '15', regex=True))
    # na=False: a missing X11 must not be treated as "< 1".
    Less_Than_One = IntRate_NoMiss['X11'].str.contains('< 1', na=False)
    No_Employer = IntRate_NoMiss['X10'].isnull()
    # ``.ix`` was removed from pandas; ``.loc`` is the label-based equivalent.
    IntRate_NoMiss.loc[No_Employer & Less_Than_One, 'X11'] = '0'
    IntRate_NoMiss.loc[~No_Employer & Less_Than_One, 'X11'] = '1'
    IntRate_NoMiss['X11'] = pd.to_numeric(IntRate_NoMiss['X11'], errors='coerce')

    IntRate_Final = IntRate_NoMiss.drop(['X10'], axis=1)
    IntRate_Final = IntRate_Final.fillna(
        IntRate_Final.median(numeric_only=True)['X4':])

    IntRate_Train_Final = IntRate_Final.loc['x']
    IntRate_Test_Final = IntRate_Final.loc['y']
    return (IntRate_Train_Final, IntRate_Test_Final)

def First_Model_SVR(Scaled_Input_Data, Output_Data):
    """Tune an RBF support vector regressor and report its cross-validated score.

    A grid search (5-fold) over ``C`` and ``gamma`` is fitted first; an SVR
    built from the best parameters is then scored with 10-fold cross
    validation, and the average time per fold is printed.

    Args:
        Scaled_Input_Data: Feature matrix passed to scikit-learn.
        Output_Data: Target values aligned with ``Scaled_Input_Data``.

    Returns:
        A tuple ``(MeanMSE_SVR, svr_Tuned)``: the mean of the 10
        ``neg_mean_squared_error`` scores and the fitted ``GridSearchCV``.
        NOTE: with this scoring the scores are negated MSE values, so the
        mean is <= 0; it is returned unchanged for backward compatibility.
    """
    n = len(Scaled_Input_Data)
    N_Folds = 10

    Grid_Dict = {"C": [1e-1, 1e0, 1e1], "gamma": np.logspace(-2, 1, 3)}
    svr_Tuned = GridSearchCV(SVR(kernel='rbf', gamma=0.1, tol=0.05), cv=5,
                             param_grid=Grid_Dict,
                             scoring="neg_mean_squared_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)

    SVR_MSE = SVR(kernel='rbf', C=svr_Tuned.best_params_['C'],
                  gamma=svr_Tuned.best_params_['gamma'], tol=0.01)

    # Time the cross validation itself. The timer used to wrap only the SVR
    # constructor, so the reported duration was always close to zero.
    T0 = time.perf_counter()
    MSEs_SVR = model_selection.cross_val_score(
        SVR_MSE, Scaled_Input_Data, Output_Data, cv=N_Folds,
        scoring="neg_mean_squared_error")
    SVR_Time = time.perf_counter() - T0

    print('The computational time of Radial based Support Vector Regression for ', n, ' examples is: ',
          SVR_Time / N_Folds)
    MeanMSE_SVR = np.mean(MSEs_SVR)
    print('The average MSE of Radial based Support Vector Regression for ', n, ' examples is: ', MeanMSE_SVR)
    return (MeanMSE_SVR, svr_Tuned)


def SVR_Predictor(svr_Tuned, Input_test_Data, Address_test):
    """Predict with a fitted model and write the predictions to a CSV file.

    Args:
        svr_Tuned: Fitted estimator exposing ``predict``.
        Input_test_Data: Feature matrix to predict on.
        Address_test: Destination path (or buffer) for the CSV output.

    Returns:
        The predictions exactly as returned by ``svr_Tuned.predict``.
    """
    Predicted_SVR = svr_Tuned.predict(Input_test_Data)
    Predicted_SVR_S = pd.Series(Predicted_SVR)
    Predicted_SVR_S.to_csv(Address_test, sep=',')
    return (Predicted_SVR)

0条回答

目前没有回答

相关问题更多 >

编程相关推荐

热门问题

热门文章