如何从现有的输入数据中进行多元线性回归

2024-06-16 09:08:24 发布

您现在位置:Python中文网/ 问答频道 /正文

我需要基于已有的输入数据建立新的模型。下面的代码处理的是目前从电子表格中读取的数据。之后我还需要用一张留出(holdout)数据表来测试这段代码。我该如何利用这些数据建立多元线性回归模型?另外,还可以用这些数据建立哪些其他类型的模型?

from sklearn.decomposition import PCA
from sklearn import model_selection
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale


import numpy as np
import pandas as pd
import datetime
import time


from datetime import datetime

from datetime import datetime
# from pandas import ExcelWriter

def DTFormatOpt(s, flist):
    """Parse ``s`` with the first format in ``flist`` that fits.

    Args:
        s: Text to parse.
        flist: Iterable of ``strptime`` format strings, tried in order.

    Returns:
        The ``datetime`` produced by the first matching format, or ``None``
        when no format matches (including when ``flist`` is empty).
    """
    for pattern in flist:
        try:
            parsed = datetime.strptime(s, pattern)
        except ValueError:
            # This format does not fit; move on to the next candidate.
            continue
        return parsed
    return None




def Input_PreProcessor(IntRate_whole_data):
    """Clean and encode the raw table, then split it into train and test parts.

    The steps are: drop unused columns, strip currency/percent formatting,
    rank ``X9``, fill missing values, one-hot encode the categorical columns,
    derive features from ``X5``, ``X15``, ``X23`` and ``X11``, and finally
    select the rows labelled ``'x'`` and ``'y'``.

    Args:
        IntRate_whole_data: DataFrame holding columns ``X1`` .. ``X32``.
            NOTE(review): the final ``.loc['x']`` / ``.loc['y']`` lookups
            presumably rely on the training and hold-out tables having been
            stacked with ``pd.concat(..., keys=['x', 'y'])`` -- confirm
            against the caller.

    Returns:
        Tuple ``(IntRate_Train_Final, IntRate_Test_Final)``: the processed
        rows labelled ``'x'`` and the processed rows labelled ``'y'``.
    """
    # Removing BS information
    R_Drop = IntRate_whole_data.drop(['X2', 'X3', 'X8', 'X16', 'X18', 'X19', 'X20'], axis=1)

    # Converting the columns with dollar and percentage sign to floats
    # (raw strings keep the regex escapes from being read as string escapes).
    R_Drop[['X4', 'X5', 'X6']] = (
        R_Drop[['X4', 'X5', 'X6']].replace(r'[\$,]', '', regex=True).astype(float))
    R_Drop[['X1', 'X30']] = (
        R_Drop[['X1', 'X30']].replace(r'[,%]', '', regex=True).astype(float))

    # Variable X9 is converted to numbers to take into account the ordinal
    # relationship among the values
    R_Drop['X9'] = R_Drop['X9'].rank(method='dense')

    # Handling the missing values: X25/X26 get 0, the columns from X4 onward
    # get their median, and the categorical columns get their most frequent
    # value. Results are assigned back instead of using inplace=True on a
    # selected column, which is not guaranteed to modify the parent frame.
    R_Drop['X26'] = R_Drop['X26'].fillna(0)
    R_Drop['X25'] = R_Drop['X25'].fillna(0)
    # numeric_only=True: the frame still holds text columns at this point.
    R_Drop = R_Drop.fillna(R_Drop.median(numeric_only=True)['X4':])
    Ctg_Ind_Miss = ['X7', 'X11', 'X12', 'X14', 'X15', 'X17', 'X23', 'X32']
    R_Drop[Ctg_Ind_Miss] = R_Drop[Ctg_Ind_Miss].apply(
        lambda col: col.fillna(col.value_counts().index[0]))

    # Handling categorical features by converting them to binary dummy variables
    Ctg_Ind = ['X7', 'X12', 'X14', 'X17', 'X32']
    IntRate_Dummies = pd.get_dummies(R_Drop[Ctg_Ind], drop_first=True)
    IntRate_NoMiss = R_Drop.join(IntRate_Dummies)
    IntRate_NoMiss = IntRate_NoMiss.drop(Ctg_Ind, axis=1)

    # Transforming some of the variables
    # Variable X5 is subtracted from variable X4
    IntRate_NoMiss['X5'] = IntRate_NoMiss['X4'] - IntRate_NoMiss['X5']

    # Variable X15 is categorized into 4 quarters and then binarized.
    IntRate_NoMiss['X15'] = pd.to_datetime(IntRate_NoMiss['X15'], errors='coerce', format='%b-%d')
    IntRate_NoMiss['X15'] = IntRate_NoMiss['X15'].dt.quarter
    IssueDate_Dummies = pd.get_dummies(IntRate_NoMiss['X15'], drop_first=True)
    IntRate_NoMiss = IntRate_NoMiss.join(IssueDate_Dummies)
    IntRate_NoMiss = IntRate_NoMiss.drop(['X15'], axis=1)

    # Variable X23 is subtracted from the most recent credit line opened among
    # all the borrowers, to denote the relative duration of borrowers having
    # credit lines.
    def _normalize_year(dt):
        # Repair the year of a date parsed from a two-part text value.
        if pd.isnull(dt):
            return dt
        if dt.year == 1900:
            # strptime uses 1900 when the format has no year ('%d-%b').
            # NOTE(review): 2001 presumably undoes a spreadsheet turning
            # 'Jan-01' into '1-Jan' -- confirm against the data.
            return dt.replace(year=2001)
        if dt.year > 2020:
            # '%y' maps 00-68 to 2000-2068; pull those back one century.
            return dt.replace(year=dt.year - 100)
        return dt

    Flist = ['%b-%y', '%d-%b']
    IntRate_NoMiss['X23'] = IntRate_NoMiss['X23'].apply(lambda x: DTFormatOpt(str(x), Flist))
    IntRate_NoMiss['X23'] = IntRate_NoMiss['X23'].map(_normalize_year)
    Most_Recent_Date = IntRate_NoMiss['X23'].max()
    Days_CreditLine = Most_Recent_Date - IntRate_NoMiss['X23']
    IntRate_NoMiss['X23'] = Days_CreditLine.dt.days.astype(float)

    # Variable X11 is converted to floats:
    # - work experience of less than 1 year and a missing "employer or job
    #   title" (X10) becomes 0;
    # - work experience of less than 1 year with an X10 value becomes 1;
    # - work experience of more than 10 years becomes 15 as an average.
    # The pattern removes commas and the word "year"/"years". The earlier
    # expression `'[,years]' or '[,year]'` always evaluated to its first
    # operand, a character class that stripped individual letters.
    IntRate_NoMiss['X11'] = (
        IntRate_NoMiss['X11']
        .replace(r',|\s*years?', '', regex=True)
        .replace(r'10\+', '15', regex=True))
    # na=False keeps missing entries out of the mask (astype('bool') would
    # have turned NaN into True).
    Under_One_Year = IntRate_NoMiss['X11'].str.contains('< 1', na=False)
    Has_Employer = IntRate_NoMiss['X10'].notnull()
    IntRate_NoMiss.loc[Under_One_Year & ~Has_Employer, 'X11'] = '0'
    IntRate_NoMiss.loc[Under_One_Year & Has_Employer, 'X11'] = '1'
    IntRate_NoMiss['X11'] = pd.to_numeric(IntRate_NoMiss['X11'], errors='coerce')

    IntRate_Final = IntRate_NoMiss.drop(['X10'], axis=1)
    # Second median pass covers values that the conversions above coerced to NaN.
    IntRate_Final = IntRate_Final.fillna(IntRate_Final.median(numeric_only=True)['X4':])

    # .loc replaces the removed .ix indexer; both select by row label here.
    IntRate_Train_Final = IntRate_Final.loc['x']
    IntRate_Test_Final = IntRate_Final.loc['y']

    return (IntRate_Train_Final, IntRate_Test_Final)

我还提供了一个支持向量回归(SVR)的例子,用来演示模型是如何使用这些数据的

def First_Model_SVR(Scaled_Input_Data, Output_Data):
    """Tune an RBF support vector regressor and report its cross-validated error.

    A 5-fold grid search over ``C`` and ``gamma`` is fitted first. A fresh
    ``SVR`` built from the best parameters is then scored with 10-fold
    cross-validation, and the average time per fold and the average MSE are
    printed.

    Args:
        Scaled_Input_Data: Feature matrix passed to scikit-learn.
        Output_Data: Target values matching ``Scaled_Input_Data``.

    Returns:
        Tuple ``(MeanMSE_SVR, svr_Tuned)``. ``MeanMSE_SVR`` is the mean of the
        ``"neg_mean_squared_error"`` scores, i.e. the *negated* MSE (kept
        unchanged for existing callers). ``svr_Tuned`` is the fitted
        ``GridSearchCV`` object.
    """
    n = len(Scaled_Input_Data)
    Grid_Dict = {"C": [1e-1, 1e0, 1e1], "gamma": np.logspace(-2, 1, 3)}
    svr_Tuned = GridSearchCV(SVR(kernel='rbf', gamma=0.1, tol=0.05), cv=5, param_grid=Grid_Dict,
                             scoring="neg_mean_squared_error")
    svr_Tuned.fit(Scaled_Input_Data, Output_Data)

    N_Folds = 10
    SVR_MSE = SVR(kernel='rbf', C=svr_Tuned.best_params_['C'], gamma=svr_Tuned.best_params_['gamma'], tol=0.01)

    # Time the cross-validation itself; the timer used to wrap only the SVR
    # constructor, so the reported figure measured no fitting work at all.
    T0 = time.time()
    MSEs_SVR = model_selection.cross_val_score(SVR_MSE, Scaled_Input_Data, Output_Data, cv=N_Folds,
                                                scoring="neg_mean_squared_error")
    SVR_Time = time.time() - T0
    print('The computational time of Radial based Support Vector Regression for ', n, ' examples is: ',
          SVR_Time / N_Folds)

    MeanMSE_SVR = np.mean(MSEs_SVR)
    # The scores are negated MSEs, so flip the sign for the human-readable report.
    print('The average MSE of Radial based Support Vector Regression for ', n, ' examples is: ', -MeanMSE_SVR)
    return (MeanMSE_SVR, svr_Tuned)


def SVR_Predictor(svr_Tuned, Input_test_Data, Address_test):
    """Predict with a fitted model and save the predictions as CSV.

    Args:
        svr_Tuned: Object exposing ``predict(Input_test_Data)``.
        Input_test_Data: Input passed unchanged to ``svr_Tuned.predict``.
        Address_test: Destination handed to ``Series.to_csv``.

    Returns:
        The object returned by ``svr_Tuned.predict``.
    """
    predictions = svr_Tuned.predict(Input_test_Data)
    # Wrap in a Series so the predictions are written with their positional index.
    pd.Series(predictions).to_csv(Address_test, sep=',')
    return predictions

Tags: to from import true data dt year drop