这些极性是根据词性标注(PoS)之后的模式来确定的。其中一种模式是:副词 + 副词 + 形容词,并且这些词同时出现在 D(干旱相关术语)和 F(常用词)中。



# Pattern: adverb, adverb, adjective (PoS tags starting with RB, RB, JJ).
# When one of the three words is in D (drought-related terms) and one is in F
# (frequent words), a plain "Positive"/"Negative" result is intensified.
# Assumes D and F are collections of words -- TODO confirm against the caller.
#
# Fixes relative to the earlier version:
#   * `a or b or c in D` was always true (a non-empty tuple is truthy); each
#     word of the current trigram is now tested for membership.
#   * `RES is "Positive"` compared identity, not equality; `==` is used.
#   * The "[False]" / "did not match" messages are printed only on the
#     failure paths instead of unconditionally.
#   * print() calls with a single argument behave the same on Python 2 and 3.
tri_pattern_seen = False
for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
    if not (tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith("JJ")):
        continue
    tri_pattern_seen = True
    words = (w1, w2, w3)
    tri_pairs.append(words)

    if not any(word in D for word in words):
        print("[False]: Tri Pairs Matched Nowhere in D\n")
        continue
    print("[True]: Tri Pairs are found in Drought Rel. Term")

    # First word of the trigram that is also a frequent word, if any.
    frequent = next((word for word in words if word in F), None)
    if frequent is None:
        print("[False]: Doesn't Match with Frequent Wordset\n")
        continue
    print("[True]: Tri Pairs are found in Frequent Wordset")

    if RES == "Positive":
        RES = "Highly Positive"
    elif RES == "Negative":
        RES = "Highly Negative"
    print("%s %s" % ("=" * 25, frequent))

if not tri_pattern_seen:
    print("[TriPair(F)]: Pattern for Adverb, Adverb, Adjective did not match.\n Looking for Bi-Pair Patterns\n")



# Record the current tweet, its polarity label and its frequent word as one
# new row of the results frame.
new_row = {'Tweets': tweets[i], 'Classified': RES, 'FreqWord': FW}
fuzzy_df = fuzzy_df.append(new_row, ignore_index=True)




>>> F
['drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers', 'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought', 'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant', 'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us']



real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Positive
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive
hate thought drought,Negative



# Classify every tweet: a VADER base polarity, refined by part-of-speech
# patterns whose words occur in D (drought-related terms) and F (frequent
# words). Results are added to fuzzy_df and written to fuzzy.csv.
# Assumes `tweets` is a pandas Series of strings and that D and F are
# collections of words -- TODO confirm against the code that defines them.
#
# Fixes relative to the earlier version:
#   * `a or b in D` style tests were always true (a non-empty tuple is
#     truthy); the words of the current pattern are now tested individually.
#   * String comparisons used `is` (identity); they now use equality.
#   * FW is reset for every tweet, so a frequent word found in one tweet no
#     longer leaks into the following rows.
#   * `FW = F[l] in sent` overwrote FW with a boolean; it has been removed.
#   * "[False]" / "not matched" messages are printed only when the
#     corresponding check actually fails.
#   * The analyzer is created once instead of once per tweet.
#   * Tweets are read positionally (`iloc`) everywhere instead of mixing
#     label-based and positional access.
#   * Rows are collected and concatenated once; per-row DataFrame.append is
#     quadratic and no longer exists in pandas 2.0.
#   * print() calls with a single argument behave the same on Python 2 and 3.
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer


def _first_in(words, vocabulary):
    """Return the first item of *words* contained in *vocabulary*, else None."""
    return next((word for word in words if word in vocabulary), None)


FW = ''
sia = SentimentIntensityAnalyzer()
classified_rows = []

for i in range(len(tweets)):
    one_sentence = tweets.iloc[i]
    sent = nltk.word_tokenize(one_sentence)
    PoS_TAGS = nltk.pos_tag(sent)
    FW = ''  # frequent word for this tweet only

    scores = sia.polarity_scores(text=one_sentence)
    POS = scores.get('pos')
    NEG = scores.get('neg')
    NEU = scores.get('neu')
    print("POS: %s" % POS)
    print("NEG: %s" % NEG)
    print("NEU: %s" % NEU)

    # Base polarity: the larger of pos/neg wins; ties fall back to the
    # neutral score.
    RES = str()
    if POS > NEG:
        RES = 'Positive'
    elif NEG > POS:
        RES = 'Negative'
    elif NEU >= 0.5 or POS > NEU:
        RES = 'Positive'
    elif NEU < 0.5:
        RES = 'Negative'

    # -------------------------------------------------------- PATTERN ADVERB, ADVERB, ADJECTIVE (Down)
    tri_pairs = list()
    for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
        if not (tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith("JJ")):
            continue
        words = (w1, w2, w3)
        tri_pairs.append(words)

        if _first_in(words, D) is None:
            print("[False]: Tri Pairs Matched Nowhere in D\n")
            continue
        print("[True]: Tri Pairs are found in Drought Rel. Term")

        frequent = _first_in(words, F)
        if frequent is None:
            print("[False]: Doesn't Match with Frequent Wordset\n")
            continue
        print("[True]: Tri Pairs are found in Frequent Wordset")

        # Only a plain label is intensified; an already upgraded one is kept.
        if RES in ("Positive", "Negative"):
            RES = "Highly " + RES
            FW = frequent

    if not tri_pairs:
        print("[TriPair(F)]: Pattern for Adverb, Adverb, Adjective did not match.\n Looking for Bi-Pair Patterns\n")

    # -------------------------------------------------------- PATTERN ADVERB, ADJECTIVE (Down)
    bi_pairs = list()
    for (w1, tag1), (w2, tag2) in nltk.bigrams(PoS_TAGS):
        if not (tag1.startswith("RB") and tag2.startswith("JJ")):
            continue
        words = (w1, w2)
        bi_pairs.append(words)

        if _first_in(words, D) is None:
            print("[False]: Bi Pairs Matched Nowhere in D")
            continue
        print("[True]: Bi Pairs are found in Drought Rel. Term")

        frequent = _first_in(words, F)
        if frequent is None:
            print("[False]: Bi Pairs found missing in Freq. Wordset")
            continue
        print("[True]: Bi Pairs are found in Frequent Wordset")

        if RES in ("Positive", "Negative"):
            RES = "Moderately " + RES
            FW = frequent

    if not bi_pairs:
        print("[BiPair(F)]: Pattern Not Matched, Looking for Mono Pattern")

    # -------------------------------------------------------- PATTERN ADJECTIVE (Down)
    for w, tag in PoS_TAGS:
        print("%s  -  %s" % (w, tag))
        if not tag.startswith("JJ"):
            print("%s is not an ADJECTIVE" % w)
            continue
        if w not in D:
            print("Unmatched in D")
            continue
        print("Matched with D")
        if w not in F:
            print("Unmatched in F")
            continue
        print("Matched with F")
        # A lone adjective leaves the polarity as it is and only records the
        # frequent word.
        if RES in ("Positive", "Negative"):
            FW = w

    # -------------------------------------------------------- MAKING ENTRY OF RECORDS OF TWEETS and POLARITY RESULT
    classified_rows.append({'Tweets': one_sentence, 'Classified': RES, 'FreqWord': FW})

if classified_rows:
    new_rows = pd.DataFrame(classified_rows, columns=['Tweets', 'Classified', 'FreqWord'])
    fuzzy_df = pd.concat([fuzzy_df, new_rows], ignore_index=True)
fuzzy_df.to_csv("fuzzy.csv", index=False)

import io
import pandas as pd
from collections import Counter

# Inline CSV standing in for the classifier output. The FreqWord column is
# empty on input and is filled in at the end of this snippet.
_SAMPLE_CSV = """Tweets,Classified,FreqWord
real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,
love thought drought,Positive,
neville rooney end ever tons trophy drought,Positive,
lakes drought,Positive,
lakes fan joint trailblazers dot forget play drought,Positive,
reign mother kerr funny none tried make come back drought,Positive,
wonder could help thai market b post reuters drought devastates south europe crops,Negative,"""
open_strio = io.StringIO(_SAMPLE_CSV)

# The context manager hands back the same buffer and closes it afterwards.
with open_strio as fin:
    df = pd.read_csv(fin)

# Frequent words, each with a count of one.
dictionary = Counter([
    'drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers',
    'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought',
    'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant',
    'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us',
])


def _frequent_counts(tweet):
    """Return the counts of the tweet's words that are also in `dictionary`."""
    tweet_counts = Counter(tweet.split())
    # Counter intersection keeps only shared keys (with the smaller count).
    return tweet_counts & dictionary


def _matched_words(counts):
    """Return the words held by a Counter as a plain list."""
    return list(counts.keys())


df['FreqCounter'] = df['Tweets'].apply(_frequent_counts)
df['FreqWord'] = df['FreqCounter'].apply(_matched_words)





`next` 会遍历一条 tweet 中的单词,返回第一个出现在常用词列表中的单词;如果一个都没有找到,则返回 'Not Found'。

# Toy frame: one tweet-like sentence per row.
df = pd.DataFrame({'name': ['I am going somewhere', 'tomorrow is holiday']})

# Frequent words to look for.
lst = ['holiday', 'am']


def _first_frequent(sentence):
    """Return the first word of *sentence* found in `lst`, else 'Not Found'."""
    hits = (token for token in sentence.split() if token in lst)
    # next() takes the first hit, or the default when the generator is empty.
    return next(hits, 'Not Found')


# One lookup per row; the result goes into a new column.
df['freq'] = df['name'].map(_first_frequent)

    name                    freq
0   I am going somewhere    am
1   tomorrow is holiday     holiday

