Applying NLP n-gram patterns to a dataframe in Python

Published 2024-04-19 06:14:43


The premise: I classify tweets into six different polarities: Positive, Moderately Positive, Highly Positive, Negative, Moderately Negative, and Highly Negative.

Since the process goes through NLP steps (using NLTK), I need to run it one tweet, one sentence, at a time.

Problem:
These polarities are defined by patterns found after part-of-speech tagging. One such pattern is adverb + adverb + adjective (Penn Treebank tags RB, RB, JJ), where the matched words must also appear in D (drought-related terms) and in F (frequent words).

I need the frequent words that turn a sentence into any of these six polarities to be saved into my dataframe.

Snippet:
This is what I have tried:

for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
    if tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith("JJ"):
        tri_pairs.append((w1, w2, w3))
        if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in D:
            print("[True]: Tri Pairs are found in Drought Rel. Term")

            for j in range(len(F)):
                if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in F[j]:
                    print("[True]: Tri Pairs are found in Frequent Wordset")
                    if RES is "Positive":
                        RES = "Highly Positive"
                    elif RES is "Negative":
                        RES = "Highly Negative"
                    print "="*25,F[j]
                    FW_list.append(F[j])
                else:
                    print"[False]: Doesn't Match with Frequent Wordset\n"

        else:
            print"[False]: Tri Pairs Matched Nowhere in D\n"
    else:
        print "[TriPair(F)]: Pattern for Adverb, Adverb, Adjective did not match.\n Looking for Bi-Pair Patterns\n"
print(tri_pairs)
print(">"*13,FW)

As you may have noticed, I have tried printing in most ways, using lists and even inner loops. Neither returned anything useful. Similarly, the other two patterns determine the remaining polarities.
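
One likely reason nothing useful comes back: in Python, `tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in D` parses as `tri_pairs[0] or tri_pairs[1] or (tri_pairs[2] in D)`, and a non-empty tuple is always truthy, so the branch runs regardless of D; the checks against F have the same problem. A minimal sketch of corrected membership tests (the sentence, D, and F below are made up for illustration; NLTK needs the punkt and averaged_perceptron_tagger data downloaded):

import nltk

# illustrative inputs, not the question's real data
D = {"drought", "severely", "dry"}          # drought-related terms
F = {"drought", "water", "severe", "dry"}   # frequent words
PoS_TAGS = nltk.pos_tag(nltk.word_tokenize("fields are now severely very dry"))

tri_pairs = []
for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
    if tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith("JJ"):
        tri_pairs.append((w1, w2, w3))
        # test each word individually; `a or b or c in D` does NOT do this
        if any(w in D for w in (w1, w2, w3)):
            matched = [w for w in (w1, w2, w3) if w in F]
            if matched:
                print("[True]: trigram word(s) found in F:", matched)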

I also wrote code to add the result to the dataframe:

fuzzy_df = fuzzy_df.append({'Tweets': tweets[i], 'Classified': RES, 'FreqWord': FW}, ignore_index=True)

But so far, that column in the resulting CSV comes back empty.
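
Two things are worth checking here. First, FW is only assigned deep inside the pattern branches, so if no trigram ever passes the (buggy) membership tests, the column stays empty. Second, DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; collecting rows in a list and building the frame once is safer and faster. A minimal sketch with dummy stand-ins for the loop variables:

import pandas as pd

tweets = ["love thought drought", "lakes drought"]   # dummy data
rows = []
for i in range(len(tweets)):
    RES, FW = "Positive", "drought"   # stand-ins for the classification result
    rows.append({'Tweets': tweets[i], 'Classified': RES, 'FreqWord': FW})

fuzzy_df = pd.DataFrame(rows, columns=['Tweets', 'Classified', 'FreqWord'])
fuzzy_df.to_csv("fuzzy.csv", index=False)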

Edit 1:

I already have the frequent words. They are as follows:

>>> F
['drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers', 'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought', 'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant', 'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us']

Edit 2:

The CSV looks like this:

Tweets,Classified,FreqWord
 real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,
 calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,
 love thought drought,Positive,
 neville rooney end ever tons trophy drought,Positive,
 lakes drought,Positive,
 lakes fan joint trailblazers dot forget play drought,Positive,
 reign mother kerr funny none tried make come back drought,Positive,
 wonder could help thai market b post reuters drought devastates south europe crops,Negative,

Input file:

tweets,polarity
real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Positive
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive
hate thought drought,Negative

Note, however, that the output shown above is tokenized and has had stop words removed.

Expected output file:

Tweets,Classified,FreqWord
     real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,water
     calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,drought
     love thought drought,Positive,drought
     neville rooney end ever tons trophy drought,Positive,rain
     lakes drought,Positive,drought
     lakes fan joint trailblazers dot forget play drought,Positive,farmer
     reign mother kerr funny none tried make come back drought,Positive,crops
     wonder could help thai market b post reuters drought devastates south europe crops,Negative,crops

Edit 3:

FW = ''
for i in range(len(tweets)):
    sent = nltk.word_tokenize(tweets[i])
    PoS_TAGS = nltk.pos_tag(sent)

    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()

    one_sentence = tweets.iloc[i]
    scores = sia.polarity_scores(text=one_sentence)
    print "POS:", scores.get('pos')
    print "NEG:", scores.get('neg')
    print "NEU:", scores.get('neu')

    POS = scores.get('pos')
    NEG = scores.get('neg')
    NEU = scores.get('neu')
    RES = str()

    if POS > NEG:
        RES = 'Positive'
    elif NEG > POS:
        RES = 'Negative'
    elif NEU >= 0.5 or POS > NEU:
        RES = 'Positive'
    elif NEU < 0.5:
        RES = 'Negative'

    # -------------------------------------------------------- PATTERN ADVERB, ADVERB, ADJECTIVE (Down)
    tri_pairs = list()
    for (w1, tag1), (w2, tag2), (w3, tag3) in nltk.trigrams(PoS_TAGS):
        if tag1.startswith("RB") and tag2.startswith("RB") and tag3.startswith("JJ"):
            tri_pairs.append((w1, w2, w3))
            if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in D:
                print("[True]: Tri Pairs are found in Drought Rel. Term")
                # TRIGGER AREA
                for j in range(len(F)):
                    if tri_pairs[0] or tri_pairs[1] or tri_pairs[2] in F[j]:
                        print("[True]: Tri Pairs are found in Frequent Wordset")
                        if RES is "Positive":
                            RES = "Highly Positive"
                            FW = F[j]
                            #fuzzy_df['FreqWord'].map(lambda x: next((y for y in x.split() if y in F), 'Not Found'))
                        elif RES is "Negative":
                            RES = "Highly Negative"
                            FW = F[j]
                    else:
                        print"[False]: Doesn't Match with Frequent Wordset\n"

            else:
                print"[False]: Tri Pairs Matched Nowhere in D\n"

        else:
            print "[TriPair(F)]: Pattern for Adverb, Adverb, Adjective did not match.\n Looking for Bi-Pair Patterns\n"
    print(tri_pairs)

    # -------------------------------------------------------- PATTERN ADVERB, ADJECTIVE (Down)
    bi_pairs = list()
    for (w1, tag1), (w2, tag2) in nltk.bigrams(PoS_TAGS):
        if tag1.startswith("RB") and tag2.startswith("JJ"):
            bi_pairs.append((w1, w2))

            if bi_pairs[0] or bi_pairs[1] in D:
                print("[True]: Bi Pairs are found in Drought Rel. Term")

                for k in range(len(F)):
                    if bi_pairs[0] or bi_pairs[1] is F[k]:
                        print("[True]: Bi Pairs are found in Frequent Wordset")
                        if RES is "Positive":
                            RES = "Moderately Positive"
                            FW = F[k]
                        elif RES is "Negative":
                            RES = "Moderately Negative"
                            FW = F[k]
                    else:
                        print("[False]: Bi Pairs found missing in Freq. Wordset")

            else:
                print("[False]: Bi Pairs Matched Nowhere in D")

        else:
            print("[BiPair(F)]: Pattern Not Matched, Looking for Mono Pattern")
    print(bi_pairs)

    # -------------------------------------------------------- PATTERN ADJECTIVE (Down)
    for w, tag in PoS_TAGS:
        print w, " - ", tag
        if tag.startswith("JJ"):
            if w in D:
                print("Matched with D")
                for l in range(len(F)):
                    if w is F[l]:
                        print("Matched with F")
                        if RES is "Positive":
                            RES = "Positive"
                            FW = F[l]
                        elif RES is "Negative":
                            RES = "Negative"
                            FW = F[l]
                    else:
                        print("Unmatched in F")
                        FW = F[l] in sent
            else:
                print("Unmatched in D")
        else:
            print w, "is not an ADJECTIVE"


# -------------------------------------------------------- MAKING ENTRY OF RECORDS OF TWEETS and POLARITY RESULT
    fuzzy_df = fuzzy_df.append({'Tweets': tweets[i], 'Classified': RES, 'FreqWord': FW}, ignore_index=True)
# ADDING RECORDS IN DATAFRAME
fuzzy_df.to_csv("fuzzy.csv", index=False)
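
A further note on this snippet: comparisons such as RES is "Positive" test object identity, not string equality, and can be False even when the strings match; they should read RES == "Positive". Likewise, bi_pairs[0] or bi_pairs[1] is F[k] has the same operator-precedence problem as the trigram checks above. A quick illustration:

# identity (`is`) vs. equality (`==`) on strings
a = "".join(["Highly ", "Positive"])   # built at runtime, so not interned
b = "Highly Positive"
print(a == b)   # True: the values are equal
print(a is b)   # typically False in CPython: two distinct objects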

2 Answers

Is this what you want to do?

import io
import pandas as pd
from collections import Counter

open_strio = io.StringIO("""Tweets,Classified,FreqWord
real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,
love thought drought,Positive,
neville rooney end ever tons trophy drought,Positive,
lakes drought,Positive,
lakes fan joint trailblazers dot forget play drought,Positive,
reign mother kerr funny none tried make come back drought,Positive,
wonder could help thai market b post reuters drought devastates south europe crops,Negative,""")

with open_strio as fin:
    df = pd.read_csv(fin)


dictionary = Counter(['drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers', 'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought', 'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant', 'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us'])

df['FreqCounter'] = df['Tweets'].apply(lambda x: Counter(x.split()) & dictionary)
df['FreqWord'] = df['FreqCounter'].apply(lambda x: list(x.keys()))

First, create a simple Counter() object (dictionary here) from your defined word list.

Then apply a Counter() intersection to each tweet row to create the df['FreqCounter'] column.

Finally, extract the unique set of keys from df['FreqCounter'] to populate df['FreqWord'].
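
For reference, the & operator on two Counter objects keeps only the keys present in both, each with the minimum of the two counts; a small illustration of what ends up in df['FreqCounter']:

from collections import Counter

tweet_counts = Counter("love thought drought drought".split())
dictionary = Counter(['drought', 'water', 'love', 'thought'])

# keys common to both; 'drought' is capped at 1, the dictionary's count
print(tweet_counts & dictionary)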


If you don't need the counts of dictionary words per tweet row, you can simply use a set, i.e.

import io
import pandas as pd

open_strio = io.StringIO("""Tweets,Classified,FreqWord
real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,
love thought drought,Positive,
neville rooney end ever tons trophy drought,Positive,
lakes drought,Positive,
lakes fan joint trailblazers dot forget play drought,Positive,
reign mother kerr funny none tried make come back drought,Positive,
wonder could help thai market b post reuters drought devastates south europe crops,Negative,""")

with open_strio as fin:
    df = pd.read_csv(fin)


dictionary = set(['drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers', 'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought', 'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant', 'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us'])

df['FreqWord'] = df['Tweets'].apply(lambda x: set(x.split()) & dictionary)

If you want to find the most common word from df['FreqCounter'], then:

import io
import pandas as pd
from collections import Counter

open_strio = io.StringIO("""Tweets,Classified,FreqWord
real time strategy password wastelands depletion groundwater skyrocketing debts make years anantapur drought worse,Negative,
calm director day science meetings nasal talk cutting edge remote sensing research drought veg fluorescence calm love,Positive,
love thought drought,Positive,
neville rooney end ever tons trophy drought,Positive,
lakes drought,Positive,
lakes fan joint trailblazers dot forget play drought,Positive,
reign mother kerr funny none tried make come back drought,Positive,
wonder could help thai market b post reuters drought devastates south europe crops,Negative,""")

with open_strio as fin:
    df = pd.read_csv(fin)


dictionary = Counter(['drought', 'water', 'love', 'rain', 'year', 'famine', 'farmers', 'crops', 'south', 'http', 'europe', 'scarcity', 'near', 'thought', 'ever', 'devastates', 'feed', 'message', 'eduaubdedubu', 'instant', 'italy', 'severe', 'by', 'beaches', 'wildfires', 'heat', 'us'])

df['FreqCounter'] = df['Tweets'].apply(lambda x: Counter(x.split()) & dictionary)
# most_common()[0] would raise IndexError for tweets with no dictionary hits
df['FreqWord'] = df['FreqCounter'].apply(lambda x: x.most_common()[0][0] if x else '')

With this minimal example, you could also try the following simple approach:
next iterates over the words of a tweet row, checks whether any frequent word is present, and returns Not Found if none is.

# sample data frame
df = pd.DataFrame({'name': ['I am going somewhere','tomorrow is holiday']})

# list of frequent words
lst = ['holiday','am']

# check if any word in tweets exist in list of frequent words
df['freq'] = df['name'].map(lambda x: next((y for y in x.split() if y in lst), 'Not Found'))

print(df)
                   name     freq
0  I am going somewhere       am
1   tomorrow is holiday  holiday
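
If you want every frequent word in a tweet rather than just the first hit, the same idea works with a list comprehension (freq_all is a hypothetical column name, reusing df and lst from above):

# collect all matches; fall back to ['Not Found'] when the list is empty
df['freq_all'] = df['name'].map(lambda x: [y for y in x.split() if y in lst] or ['Not Found'])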
