# Load the data (the first CSV column becomes the index) and drop every row
# that contains a missing value before anything is cast to str and hashed.
dataset = pd.read_csv('ll.csv', index_col=0)
dataset = dataset.dropna(axis=0)

# Features (independent variables): an independent copy of the four input
# columns, so later changes to `x` never write back into `dataset`.
x = dataset[['Skills', 'Location', 'Industry', 'Experience']].copy()

# Apply hashing: cast every feature value to str and pass the rows to `h`.
# `astype` returns a new frame, so `x` itself keeps its original dtypes.
# NOTE(review): `h` is defined outside this section -- presumably a
# string-input sklearn FeatureHasher; confirm against its definition.
x_hash = h.transform(x.astype(str).values)
# Dependent variable: the target column, kept as a one-column frame that is
# an independent copy of the data in `dataset`.
y = dataset[['Functional Area']].copy()

# Cast the target values to str and hash them with `h`, the same transformer
# the features are hashed with.
# NOTE(review): hashing a categorical target and then regressing on the
# hashed columns makes predictions hard to map back to a label (and hash
# collisions merge labels) -- confirm this is intended, or encode the target
# with a reversible label encoding instead.
y_hash = h.transform(y.astype(str).values)
# Regressor: decision tree created with random_state=0 and fitted on the
# `.toarray()` form of the hashed feature and target matrices.
regressor = DecisionTreeRegressor(random_state=0)
train_features = x_hash.toarray()
train_targets = y_hash.toarray()
# `ll` holds whatever `fit` returns for the trained regressor.
ll = regressor.fit(train_features, train_targets)
# Build a one-row frame holding the sample to predict.  The columns are
# listed in the same order as the training features (Skills, Location,
# Industry, Experience) so both frames are laid out identically.
# NOTE(review): whether column order affects the hashed output depends on
# how `h` is configured (defined outside this section) -- confirm.
input_df = pd.DataFrame({
    'Skills': ['Illustrator'],
    'Location': ['Cairo-Egypt'],
    'Industry': ['IT - Software Services'],
    'Experience': ['1-6'],
})

# Cast to str and hash with `h`, mirroring the preprocessing of the training
# features; `input_df` itself is left unchanged.
input_df_hash = h.transform(input_df.astype(str).values)

# Predict with the fitted regressor on the `.toarray()` form of the hashed row.
sss = regressor.predict(input_df_hash.toarray())
Tags:
我觉得你需要对代码稍作修改。
一个可能的建议是手动对文档中的所有字符串进行散列,并查看它们各自对应的列(特征)。
一个最小的例子:
输出:
注意,这种方法可以处理可能出现的哈希碰撞。
相关问题 更多 >
编程相关推荐