从训练营收集我们的函数和类。

JMI-MVM的Python项目详细描述


jmi_mvm

  • 为bootcamp创建的工具集合。
  • 稍后将添加更多信息。

目录
    # JMI_MVM (functions.py): package metadata, shared imports, and helpers for
    # scoring / tuning a decision-tree classifier.
name = "JMI_MVM"
help_ = " Recommended Functions to try: \n calc_roc_auc & tune_params\n plot_hist_scat_sns & multiplot\n list2df & df_drop_regex\n plot_wide_kde_thin_bar & make_violinplot\n"

# functions.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns


def calc_roc_auc(X_test, y_test, dtc, verbose=False):
    """Score an already-fit classifier by the area under its ROC curve.

    Parameters:
        X_test: feature data passed to ``dtc.predict``.
        y_test: true labels matching ``X_test``.
        dtc: a fitted classifier exposing ``predict``.
        verbose (bool): when True, print the result as well as returning it.

    Returns:
        The ROC AUC expressed as a percentage, rounded to 3 decimal places.
    """
    # NOTE(review): roc_curve and auc are not imported anywhere in the visible
    # source; presumably they come from sklearn.metrics -- confirm and import.
    y_pred = dtc.predict(X_test)
    # The thresholds returned by roc_curve are not needed here.
    FP_rate, TP_rate, _ = roc_curve(y_test, y_pred)
    roc_auc_perc = round(auc(FP_rate, TP_rate) * 100, 3)

    if verbose:
        print(f"roc_curve's auc = {roc_auc_perc}%")
    return roc_auc_perc


def tune_params(param_name, param_values):
    """Fit one entropy decision tree per candidate value and compare AUCs.

    For every value in ``param_values`` a fresh tree is created, the
    hyper-parameter ``param_name`` is set to that value, the tree is fit on
    the training data, and ``calc_roc_auc`` is evaluated on both the training
    and the test data. The results are plotted with ``DataFrame.plot``.

    Parameters:
        param_name (str): name of the classifier parameter to tune.
        param_values (iterable): candidate values for that parameter.

    Returns:
        (df_results, df_style): the results frame indexed by ``param_name``
        with columns 'train_roc_auc' and 'test_roc_auc', and the same frame
        as a pandas Styler with a green background gradient.
    """
    # NOTE(review): X_train, y_train, X_test, y_test and DecisionTreeClassifier
    # are read from module scope but are not defined in the visible source;
    # callers presumably define them before calling -- confirm.
    rows = []
    for value in param_values:
        # Build the model, then override the single parameter being tuned.
        dtc_temp = DecisionTreeClassifier(criterion='entropy')
        dtc_temp.set_params(**{param_name: value})
        dtc_temp.fit(X_train, y_train)

        train_roc_auc = calc_roc_auc(X_train, y_train, dtc_temp)
        test_roc_auc = calc_roc_auc(X_test, y_test, dtc_temp)
        rows.append([value, train_roc_auc, test_roc_auc])

    df_results = pd.DataFrame(
        rows, columns=[param_name, 'train_roc_auc', 'test_roc_auc']
    )
    df_results.set_index(param_name, inplace=True)

    # Line plot of train vs test AUC across the candidate values.
    df_results.plot()

    # Colour-coded view of the same numbers.
    cm = sns.light_palette("green", as_cmap=True)
    df_style = df_results.style.background_gradient(cmap=cm)
    return df_results, df_style
# MULTIPLOT
from collections import Counter
from string import ascii_letters

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def multiplot(df):
    """Draw a lower-triangle correlation heat map to inspect multicollinearity.

    Parameters:
        df (DataFrame): data whose ``df.corr()`` matrix is plotted.

    Returns:
        (fig, ax): the matplotlib figure and axes holding the heat map.
    """
    sns.set(style="white")

    corr = df.corr()

    # Mask the upper triangle (diagonal included) so each pair shows once.
    # Builtin bool replaces np.bool, which NumPy removed in 1.24.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    f, ax = plt.subplots(figsize=(16, 16))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    sns.heatmap(corr, mask=mask, annot=True, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
    return f, ax


# Plots histogram and scatter (vs target) side by side
def plot_hist_scat_sns(df, target='index'):
    """Plot a histogram+KDE and a regression scatter for every numeric column.

    One figure is created per column of ``df.describe()``; the left subplot
    is a seaborn distplot of the column, the right subplot is a regplot of
    the column against ``target``.

    Parameters:
        df (DataFrame): source data.
        target (str): name of the column holding the target variable. The
            default 'index' requires a column literally named 'index'.

    Returns:
        None. Figures are created as a side effect.
    """
    import matplotlib.ticker as mtick

    with plt.style.context(('dark_background')):
        figsize = (9, 7)

        # Font settings shared by both subplots.
        fontTitle = {'fontsize': 14, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontAxis = {'fontsize': 12, 'fontweight': 'medium', 'fontfamily': 'serif'}
        fontTicks = {'fontsize': 8, 'fontweight': 'medium', 'fontfamily': 'serif'}

        for column in df.describe():
            # A 2x2 grid is created, but only the top row is used; the bottom
            # row is deleted before layout.
            fig, ax = plt.subplots(figsize=figsize, ncols=2, nrows=2)

            # ---- Subplot 1: distribution of the column ----
            i, j = 0, 0
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            hist_kws = {"linewidth": 1, "alpha": 1, "color": 'blue', 'edgecolor': 'w'}
            kde_kws = {"color": "white", "linewidth": 1, "label": "KDE"}

            # NOTE(review): sns.distplot is deprecated in recent seaborn;
            # kept because its replacement takes different keyword arguments.
            sns.distplot(df[column], norm_hist=True, kde=True,
                         hist_kws=hist_kws, kde_kws=kde_kws,
                         label=column + ' histogram', ax=ax[i, j])

            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)
            xticklab1 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab1, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            ax[i, j].set_ylabel('Density', fontdict=fontAxis)
            yticklab1 = ax[i, j].get_yticklabels(which='both')
            ax[i, j].set_yticklabels(labels=yticklab1, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            # ---- Subplot 2: column vs target with regression line ----
            i, j = 0, 1
            ax[i, j].set_title(column.capitalize(), fontdict=fontTitle)

            line_kws = {"color": "white", "alpha": 0.5, "lw": 4, "ls": ":"}
            scatter_kws = {'s': 2, 'alpha': 0.5, 'marker': '.', 'color': 'blue'}

            # x/y are passed by keyword: newer seaborn rejects positional data.
            sns.regplot(x=df[column], y=df[target], line_kws=line_kws,
                        scatter_kws=scatter_kws, ax=ax[i, j])

            ax[i, j].set_xlabel(column.title(), fontdict=fontAxis)
            xticklab2 = ax[i, j].get_xticklabels(which='both')
            ax[i, j].set_xticklabels(labels=xticklab2, fontdict=fontTicks, rotation=0)
            ax[i, j].xaxis.set_major_formatter(mtick.ScalarFormatter())

            ax[i, j].set_ylabel(target, fontdict=fontAxis)
            yticklab = ax[i, j].get_yticklabels()
            ax[i, j].set_yticklabels(yticklab, fontdict=fontTicks)
            ax[i, j].yaxis.set_major_formatter(mtick.ScalarFormatter())

            ax[i, j].set_axisbelow(True)
            ax[i, j].grid(axis='y', ls='--')

            # ---- Final layout: drop the unused bottom row ----
            fig.delaxes(ax[1, 1])
            fig.delaxes(ax[1, 0])
            fig.tight_layout()


# Tukey's method using IQR to find outliers
def detect_outliers(df, n=0, features=None):
    """Return index labels of rows that are Tukey outliers in more than n features.

    A value is an outlier for a column when it lies below Q1 - 1.5*IQR or
    above Q3 + 1.5*IQR of that column.

    Parameters:
        df (DataFrame): data containing the feature columns.
        n (int): a row is returned only if it is an outlier in MORE than
            ``n`` of the features. Defaults to 0 (any outlier).
        features (iterable of column names): columns to check. Defaults to
            every numeric column of ``df``.

    Returns:
        list of index labels, usable with ``df.loc`` / ``df.drop``.

    Examples:
        Outliers_to_drop = detect_outliers(data, 2, ["col1", "col2"])
        df.loc[Outliers_to_drop]  # show the outlier rows
        data = data.drop(Outliers_to_drop, axis=0).reset_index(drop=True)
    """
    if features is None:
        features = df.select_dtypes(include="number").columns

    # Counts, per row label, in how many features the row is an outlier.
    outlier_counts = Counter()
    for col in features:
        Q1 = np.percentile(df[col], 25)
        Q3 = np.percentile(df[col], 75)
        outlier_step = 1.5 * (Q3 - Q1)

        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)
        outlier_counts.update(df[is_outlier].index)

    return [label for label, count in outlier_counts.items() if count > n]


# describe_outliers -- calls detect_outliers
def describe_outliers(df):
    """Summarise Tukey outliers per numeric column of ``df``.

    Parameters:
        df (DataFrame): data to inspect. Only numeric columns are analysed,
            since quartiles are undefined for other dtypes.

    Returns:
        DataFrame indexed by column name with 'total_outliers' and
        'percent_total' (percentage of rows, rounded to 2 places), plus a
        final 'grand_total' row holding the sum of each column.
    """
    new_df = pd.DataFrame(columns=['total_outliers', 'percent_total'])
    n_rows = len(df.index)

    for col in df.select_dtypes(include="number").columns:
        # n=0 -> every row that is an outlier in this single column.
        outies = detect_outliers(df, 0, [col])
        percent = round((len(outies) / n_rows) * 100, 2) if n_rows else 0.0
        new_df.loc[col] = [len(outies), percent]

    new_df.loc['grand_total'] = [sum(new_df['total_outliers']),
                                 sum(new_df['percent_total'])]
    return new_df


#### Cohen's d
def Cohen_d(group1, group2):
    """Compute Cohen's d effect size between two samples.

    Parameters:
        group1: Series or NumPy array.
        group2: Series or NumPy array.

    Returns:
        A floating point number: the difference of means divided by the
        square root of the size-weighted pooled variance.

    NOTE: the variance comes from each object's own ``.var()``; pandas
    defaults to ddof=1 while NumPy defaults to ddof=0, so the two input
    types can give slightly different results.
    """
    diff = group1.mean() - group2.mean()

    n1, n2 = len(group1), len(group2)
    var1 = group1.var()
    var2 = group2.var()

    # Pooled variance, weighting each group's variance by its size.
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)

    return diff / np.sqrt(pooled_var)


def plot_pdfs(cohen_d=2):
    """Plot the PDFs of two unit-variance normals whose means differ by cohen_d.

    Parameters:
        cohen_d: number of standard deviations between the two means.

    Returns:
        None. Fills the two curves on the current axes and prints the overlap
        and superiority values.
    """
    # Local import: the visible module never imports scipy at top level.
    import scipy.stats

    # NOTE(review): evaluate_PDF and overlap_superiority are not defined in
    # the visible source -- confirm they exist elsewhere in the package.
    group1 = scipy.stats.norm(0, 1)
    group2 = scipy.stats.norm(cohen_d, 1)

    xs, ys = evaluate_PDF(group1)
    plt.fill_between(xs, ys, label='Group1', color='#ff2289', alpha=0.7)

    xs, ys = evaluate_PDF(group2)
    plt.fill_between(xs, ys, label='Group2', color='#376cb0', alpha=0.7)

    o, s = overlap_superiority(group1, group2)
    print('overlap', o)
    print('superiority', s)


def list2df(list):
    """Build a DataFrame from a list of rows whose first row is the header.

    Parameters:
        list: sequence of rows; ``list[0]`` holds the column names and the
            remaining rows hold the data. (The parameter name shadows the
            builtin but is kept so keyword callers do not break.)

    Returns:
        DataFrame with ``list[0]`` as columns and ``list[1:]`` as rows.
    """
    return pd.DataFrame(list[1:], columns=list[0])


def df_drop_regex(DF, regex_list):
    """Drop every column whose name matches any of the given regex patterns.

    Parameters:
        DF -- input dataframe to remove columns from (left unmodified).
        regex_list -- list of string patterns or regexps to remove.

    Returns:
        df_cut -- copy of the input without the matched columns.
    """
    df_cut = DF.copy()
    for r in regex_list:
        matched = list(df_cut.filter(regex=r))
        df_cut = df_cut[df_cut.columns.drop(matched)]
        print(f'Removed {r}\n')
    return df_cut


####### MIKE's PLOTTING
# plotting order totals per month in violin plots
def make_violinplot(x, y, title=None, hue=None, ticklabels=None):
    """Plot a split violin plot with inner stick lines and a mean line.

    Parameters:
        x: categorical data for the x axis.
        y: numeric data for the y axis; its mean is drawn as a dotted line.
        title (str): optional axes title.
        hue: optional grouping variable splitting each violin.
        ticklabels (list): optional replacement x tick labels.

    Returns:
        None. Shows the figure with ``plt.show()``.
    """
    plt.style.use('dark_background')
    fig, ax = plt.subplots(figsize=(12, 10))

    # x/y are passed by keyword: newer seaborn rejects positional data.
    sns.violinplot(x=x, y=y, cut=2, split=True, scale='count', scale_hue=True,
                   saturation=.5, alpha=.9, bw=.25, palette='Dark2',
                   inner='stick', hue=hue, ax=ax).set_title(title)

    ax.axhline(y.mean(), label='total mean', ls=':', alpha=.5, color='xkcd:yellow')

    # Only override the tick labels when the caller supplied some.
    if ticklabels is not None:
        ax.set_xticklabels(ticklabels)

    plt.legend()
    plt.show()


### Example usage
# The original file ran the first lines of this example as live code, reading
# a `df_year_orders` frame that is never defined here; the example is kept
# purely as documentation.
# # First, declare variables to be plotted
# x = df_year_orders['month']
# y = df_year_orders['order_total']
# ticks = [v for v in month_dict.values()]
# title = 'Order totals per month with or without discounts'
# hue = df_year_orders['Discount']>0
### Then call function
# make_violinplot(x,y,title,hue, ticks)


###########
def plot_wide_kde_thin_bar(series1, sname1, series2, sname2):
    """Plot two series as a wide histogram+KDE next to a narrow mean/SEM bar chart.

    Parameters:
        series1, series2 (Series): the two samples to compare; ``series1.name``
            labels the x axis of the distribution plot.
        sname1, sname2 (str): display names for the two samples (title-cased
            for legends and bar labels).

    Returns:
        (fig, ax): the figure and a two-element list ``[kde_axes, bar_axes]``.
    """
    from scipy.stats import sem
    from matplotlib import rcParams

    rcParams['font.family'] = 'serif'
    plt.style.use('default')

    # 'seaborn-notebook' was renamed 'seaborn-v0_8-notebook' in Matplotlib
    # 3.6; use whichever exists and fall back to 'default' otherwise.
    style_name = next(
        (s for s in ('seaborn-notebook', 'seaborn-v0_8-notebook')
         if s in plt.style.available),
        'default',
    )

    with plt.style.context((style_name)):
        ## ----------- DEFINE AESTHETIC CUSTOMIZATIONS ----------- ##
        fontSuptitle = {'fontsize': 22, 'fontweight': 'bold', 'fontfamily': 'serif'}
        fontTitle = {'fontsize': 10, 'fontweight': 'medium', 'fontfamily': 'serif'}
        fontAxis = {'fontsize': 10, 'fontweight': 'medium', 'fontfamily': 'serif'}
        fontTicks = {'fontsize': 8, 'fontweight': 'medium', 'fontfamily': 'serif'}

        ## --------- CREATE FIG BASED ON GRIDSPEC --------- ##
        fig = plt.figure(constrained_layout=True, figsize=(8, 3))
        # The title is set on this figure; calling plt.suptitle before the
        # figure existed put it on a separate, implicitly created figure.
        fig.suptitle('Quantity of Units Sold', fontdict=fontSuptitle)

        # 10-column grid: 7 columns for the KDE plot, 3 for the bar chart.
        gs = fig.add_gridspec(nrows=1, ncols=10)
        ax0 = fig.add_subplot(gs[0, 0:7])
        ax1 = fig.add_subplot(gs[0, 7:10])
        ax = [ax0, ax1]

        ### ------------------  SUBPLOT 1  ------------------ ###
        ax[0].set_title('Histogram + KDE', fontdict=fontTitle)

        # Group 1: data, label, hist_kws and kde_kws
        plotS1 = {'data': series1, 'label': sname1.title(),
                  'hist_kws': {'edgecolor': 'black', 'color': 'darkgray',
                               'alpha': 0.8, 'lw': 0.5},
                  'kde_kws': {'color': 'gray', 'linestyle': '--',
                              'linewidth': 2, 'label': 'kde'}}

        # Group 2: data, label, hist_kws and kde_kws
        plotS2 = {'data': series2, 'label': sname2.title(),
                  'hist_kws': {'edgecolor': 'black', 'color': 'green',
                               'alpha': 0.8, 'lw': 0.5},
                  'kde_kws': {'color': 'darkgreen', 'linestyle': ':',
                              'linewidth': 3, 'label': 'kde'}}

        # NOTE(review): sns.distplot is deprecated in recent seaborn.
        for group in (plotS1, plotS2):
            sns.distplot(group['data'], label=group['label'],
                         hist_kws=group['hist_kws'], kde_kws=group['kde_kws'],
                         ax=ax[0])

        ax[0].set_xlabel(series1.name, fontdict=fontAxis)
        ax[0].set_ylabel('Kernel Density Estimation', fontdict=fontAxis)
        ax[0].tick_params(axis='both', labelsize=fontTicks['fontsize'])
        ax[0].legend()

        ### ------------------  SUBPLOT 2  ------------------ ###
        bar_labels = [plotS1['label'], plotS2['label']]
        bar_heights = [np.mean(plotS1['data']), np.mean(plotS2['data'])]
        yerr = [sem(plotS1['data']), sem(plotS2['data'])]
        err_kws = {'ecolor': 'black', 'capsize': 5, 'capthick': 1, 'elinewidth': 1}

        ax[1].bar(bar_labels, bar_heights, align='center', edgecolor='black',
                  yerr=yerr, error_kw=err_kws, width=0.6)

        ax[1].set_title('Average Quantities Sold', fontdict=fontTitle)
        ax[1].set_ylabel('Mean +/- SEM ', fontdict=fontAxis)
        ax[1].set_xlabel('')
        # axis takes the strings 'x'/'y'/'both'; the original passed the data
        # lists themselves, which is not a valid axis selector.
        ax[1].tick_params(axis='both', labelsize=fontTicks['fontsize'])
        ax[1].set_xticklabels(bar_labels, rotation=45, ha='center')

        plt.show()
        return fig, ax

    欢迎加入QQ群-->: 979659372 Python中文网_新手群

    推荐PyPI第三方库


    热门话题
    javajexcel包装文本问题   EclipseJavaEnum缩进超过左括号。如何让它看起来更正常?   java有办法包含Tomcat 6 catalina。out和localhost。在网络应用的日志文件中记录内容?   java如何永久性地阻止JavaFX代码在eclipse中被突出显示为错误?   如何在java中优化两个for循环(for循环中的for循环)   java如何在我的windows机器上从jar文件创建mac osx的可执行文件   使用记忆化/动态规划的Java组合学   Java中的游荡对象垃圾收集   java为什么我在JSP和JDBC和MySQL中遇到连接失败错误   java轮询Pod的就绪状态   如何创建电子邮件并将其发送到Java中的特定地址?   java如何修复Dagger 2错误“。。。无法提供[…]”?   java Android单选按钮看起来太轻   Android Studio:开发在应用程序之间共享的通用java库