分析YouTube视频进行一般情绪分析
youtube-sentiment的Python项目详细描述
YouTube情感助手
基于scikit-learn,通过分析视频评论的正面/负面情绪来确定YouTube视频的整体情绪。该辅助工具通过YouTube API获取评论,并向机器学习模型发出请求以判定情绪。
安装说明
pip install .
或pypi(https://pypi.org/project/youtube-sentiment/)
pip install youtube-sentiment
如何使用
当前使用情况:
import youtube_sentiment as yt
yt.video_summary(<Youtube API Key>, <Youtube Video ID>, <Max Pages of Comments>, <Sentiment Model>)
或
python main.py <Youtube API Key> <Youtube Video ID> <Max Pages of Comments> <Sentiment Model>
可用的情感模型位于项目路径 ./models 下
测试
python setup.py test
待办事项
- [X]创建API以通过rest使用YouTube API v3获取视频评论
- [X]创建初始python包
- [X]分析现有的情感分析模型,选择和使用
- 改进现有的情感学习模型
- []建立用于情感分析的深度学习模型
- [X]利用情绪分析来分析YouTube视频并提供分析
- [X]完成项目的python包
- []修复任何新的错误
- []创建基于Web的门户
提供的模型
- lr_sentiment_basic(基本向量器/逻辑回归模型,2 MB)
- lr_sentiment_cv(在清洗后的数据集上经超参数调优的TF-IDF/逻辑回归模型,60 MB)
- 待添加 cnn_sentiment(卷积神经网络模型)
- 待添加 cnn_sentiment(LSTM神经网络模型)
传统机器学习(ML)模型的创建
为什么要使用Twitter情感数据作为训练数据?
tldr:对于传统模型,引导是最简单、最有效的方法
# Develop sentiment analysis classifier using traditional ML models
# Pipeline modeling using the following guide:
# https://ryan-cranfill.github.io/sentiment-pipeline-sklearn-1/
# Data processing and cleaning guide:
# https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90

# Imports
import re
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, auc, roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline

# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; fall back to the standalone `joblib` package (a scikit-learn
# dependency) so the notebook keeps running on newer releases.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
# Dataset of 1.6m Twitter tweets
# The CSV files carry no header row, so the column names are supplied here.
columns = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
train = pd.read_csv(
    'stanford_twitter_train.csv',
    encoding='latin-1',
    header=None,
    names=columns,
)
test = pd.read_csv(
    'stanford_twitter_test.csv',
    encoding='latin-1',
    header=None,
    names=columns,
)
## Local helpers

# AUC visualization
def show_roc(model, test, test_labels):
    """Plot the ROC curve of a fitted classifier with its AUC in the legend.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        test: Samples passed to ``model.predict_proba``.
        test_labels: True labels for ``test``.

    Column 1 of the predicted probabilities is used as the positive-class
    score. The chart is displayed with ``plt.show()``; nothing is returned.
    """
    # Predict
    probs = model.predict_proba(test)
    preds = probs[:, 1]
    fpr, tpr, _ = roc_curve(test_labels, preds)
    roc_auc = auc(fpr, tpr)

    # Chart
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


# Tweet cleanser
tok = nltk.tokenize.WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'   # @mentions
pat2 = r'https?://[^ ]+'   # http(s) URLs
combined_pat = r'|'.join((pat1, pat2))
# NOTE(review): the '.' below is unescaped, so it matches any character after
# "www"; left as-is to keep the cleaning output unchanged.
www_pat = r'www.[^ ]+'
negations_dic = {
    "isn't": "is not", "aren't": "are not", "wasn't": "was not",
    "weren't": "were not", "haven't": "have not", "hasn't": "has not",
    "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
    "don't": "do not", "doesn't": "does not", "didn't": "did not",
    "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
    "mightn't": "might not", "mustn't": "must not",
}
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic) + r')\b')


def clean_tweet(text):
    """Return a normalized, letters-only version of a raw tweet.

    Steps: strip HTML markup, remove @mentions and URLs, lower-case, expand
    the contractions listed in ``negations_dic``, replace every non-letter
    with a space, then drop single-character tokens and re-join the rest
    with single spaces.

    Args:
        text: Raw tweet text (may contain HTML markup and entities).

    Returns:
        The cleaned tweet as a single space-separated string.
    """
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    try:
        bom_removed = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        # Python 3 ``str`` has no ``.decode()`` (AttributeError), and an
        # implicit re-encode can fail on Python 2 (UnicodeError); in both
        # cases keep the text as extracted.
        bom_removed = souped
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda x: negations_dic[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # The letters-only substitution above leaves runs of unnecessary
    # whitespace; tokenizing and re-joining collapses them.
    words = [x for x in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()
# Data cleaning
cleaned_tweets = [clean_tweet(tweet) for tweet in train['text']]
cleaned_df = pd.DataFrame(cleaned_tweets, columns=['text'])
cleaned_df['target'] = train.sentiment
# Use .loc rather than chained indexing (`cleaned_df.target[mask] = 1`):
# chained assignment may write to a temporary copy and be silently lost.
cleaned_df.loc[cleaned_df.target == 4, 'target'] = 1  # rename 4 to 1 as positive label
cleaned_df = cleaned_df[cleaned_df.target != 2]  # remove neutral labels
cleaned_df = cleaned_df.dropna()  # drop null records
cleaned_df.to_csv('stanford_clean_twitter_train.csv', encoding='utf-8')
# Starting point from import
csv = 'stanford_clean_twitter_train.csv'
df = pd.read_csv(csv, index_col=0)

# Ensure no null records, then random shuffle.
# Nulls are dropped first so that the reset index is contiguous, and the
# shuffle is seeded so that reruns operate on the same row order.
df = df.dropna()  # drop null records
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Max data size 200k for memory purposes.
# .iloc makes the positional "first 200k rows" intent explicit, independent
# of the index labels.
X, y = df.text.iloc[:200000], df.target.iloc[:200000]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.10
)

# Dataset shapes post-split
print(np.shape(X_train))
print(np.shape(X_test))
print(np.unique(y_train))
(180000,)
(20000,)
[0 1]
# NLTK Twitter tokenizer best used for short comment-type text sets
import nltk

# TweetTokenizer is referenced through the public `nltk.tokenize` package,
# the same path the tweet cleanser uses for WordPunctTokenizer.
tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False)
# Hyperparameter tuning (Simple model)
# Alternative vectorizer, not used in this search:
#   cvect = CountVectorizer(tokenizer=tokenizer.tokenize)
tfidf = TfidfVectorizer()
clf = LogisticRegression()
pipeline = Pipeline([('tfidf', tfidf), ('clf', clf)])

# Keys follow scikit-learn's "<step name>__<parameter>" convention.
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],  # ngram range of tokenizer
    'tfidf__norm': ['l1', 'l2', None],  # term vector normalization
    'tfidf__max_df': [0.25, 0.5, 1.0],  # maximum document frequency for the TfidfVectorizer
    'clf__C': np.logspace(-2, 0, 3),  # C value for the LogisticRegression
}
grid = GridSearchCV(pipeline, parameters, cv=3, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
t0 = time.time()
grid.fit(X_train, y_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid.best_score_)
print("Best parameters set:")
best_parameters = grid.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
Performing grid search...
pipeline: ['tfidf', 'clf']
Fitting 3 folds for each of 81 candidates, totalling 243 fits
[Parallel(n_jobs=1)]: Done 243 out of 243 | elapsed: 52.7min finished
done in 3186.295s
Best score: 0.803
Best parameters set:
clf__C: 0.01
tfidf__max_df: 0.25
tfidf__ngram_range: (1, 3)
tfidf__norm: None
# Dump model from grid search cv
# Persists the best pipeline found by the grid search, compressed at level 1.
joblib.dump(grid.best_estimator_, 'lr_sentiment_cv.pkl', compress=1)
['lr_sentiment_cv.pkl']
# Starting point 2: Post-model load comparison
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files from a trusted source.
lra = joblib.load('./Models/Stanford_Twitter_Models/lr_sentiment_cv.pkl')  # hypertuned model
lrb = joblib.load('./Models/Twitter_Simple_Models/lr_sentiment_basic.pkl')  # basic model
# Model performance indicators for basic model
y_pred_basic = lrb.predict(X_test)
print(confusion_matrix(y_test, y_pred_basic))
show_roc(lrb, X_test, y_test)  # AUC
[[7562 2347]
[2181 7910]]
# Model performance indicators for hypertuned model
y_pred_hyper = lra.predict(X_test)
print(confusion_matrix(y_test, y_pred_hyper))
show_roc(lra, X_test, y_test)  # AUC
[[7861 2048]
[1863 8228]]
# Spot-check the basic model on two hand-written comments
print(lrb.predict(["terrible idea why was this even made"]))
print(lrb.predict(["that was the best movie ever"]))
[0]
[1]