Finding the frequency of words from one CSV column in another CSV column

Posted 2024-04-26 13:48:56


I am new to Python. I have two csv files; one contains bacteria names:

import csv
from collections import Counter

# Bacteria file: one bacterium name per row
Bac = []
with open("/home/shayez/Desktop/Bacteria.csv", "r") as csv_file1:
    csv_reader1 = csv.reader(csv_file1, delimiter=',')
    for line1 in csv_reader1:
        Bac.append(line1[0])

# Abstract file: the abstract text is in the third column
Abs = []
with open("/home/shayez/Desktop/Anti.csv", "r") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for line in csv_reader:
        Abs.append(line[2])

# Accumulate the word counts of every abstract into one Counter
cntword = Counter()
for ab in Abs:
    cntword.update(ab.split())

# Print how often each bacterium name occurs in the abstracts
for name in Bac:
    print(f"{name}: {cntword[name]}")

It looks like this:

[screenshot of the bacteria file]

This is the bacteria file; it contains about 2200 bacteria names.

The second file contains the abstracts, like this:

[screenshot of the abstracts file]

I have to compare the bacteria names from the first file against the abstract column of the second file, count how often each bacterium occurs in the abstracts, and save the result to a third csv, like this:

[screenshot of the desired output]
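A minimal sketch of that task (the real file paths and column layout from the screenshots are replaced here by small inline lists, and the `frequencies.csv` output name is my own choice): count every word across the abstracts once with a single `Counter`, then look up each bacterium name and write the result out.

```python
import csv
from collections import Counter

# Hypothetical stand-ins for the two csv files
bacteria = ["Ecoli", "Salmonella"]
abstracts = [
    "Ecoli was detected in the sample alongside Salmonella",
    "Ecoli resistance increased over time",
]

# Count every word across all abstracts in one pass
word_counts = Counter()
for text in abstracts:
    word_counts.update(text.split())

# Look up the frequency of each bacterium name
freq = {name: word_counts[name] for name in bacteria}
print(freq)  # {'Ecoli': 2, 'Salmonella': 1}

# Write the frequencies to a third csv
with open("frequencies.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["bacteria", "frequency"])
    for name, count in freq.items():
        writer.writerow([name, count])
```

Note this only matches whole space-separated words; bacteria names containing spaces or punctuation would need a substring or regex match instead.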


1 Answer

Posted on 2024-04-26 13:48:56

I suggest you use the pandas library for this task, since you seem to have a lot of aggregation to do.

Since you don't provide an [mcve], I had to make one myself. So: read your first csv and keep its values as a list. They will later become the columns you keep.

Then... using this list, I suggest you use `apply()` on the abstract column, combined with `split()` and `Counter` (from Python's collections). Then `join()` all of these using `json_normalize()`.

import pandas as pd

from collections import Counter
# json_normalize moved to the pandas top level in 1.0;
# older versions used `from pandas.io.json import json_normalize`
from pandas import json_normalize

to_keep = ['LONER', 'I', 'AM']

df = pd.DataFrame({
        'date' : ['some date', 'some_other_date', 'lol date'],
        'garbage' : ['I AM A LONER', 'AND SO AM I LOL', 'some other weird sentence']
    })
print(df.head())
#               date                    garbage
# 0        some date               I AM A LONER
# 1  some_other_date            AND SO AM I LOL
# 2         lol date  some other weird sentence

# Here I am showing you the inside of what I insert into json_normalize.
# It basically counts the word occurrences per line. You split the words,    
# and count the list items using `Counter()`
print(df['garbage'].apply(lambda x:Counter(x.split())))
# 0                {'I': 1, 'AM': 1, 'A': 1, 'LONER': 1}
# 1       {'AND': 1, 'SO': 1, 'AM': 1, 'I': 1, 'LOL': 1}
# 2    {'some': 1, 'other': 1, 'weird': 1, 'sentence'...

# Then, you use the json_normalize() function to turn all your jsons into a big DataFrame. And join the result to the previously created DataFrame.
df = df.join( json_normalize(df['garbage'].apply(lambda x:Counter(x.split()))) )
print(df)
#               date                    garbage    A  ...    sentence  some  weird
# 0        some date               I AM A LONER  1.0  ...         NaN   NaN    NaN
# 1  some_other_date            AND SO AM I LOL  NaN  ...         NaN   NaN    NaN
# 2         lol date  some other weird sentence  NaN  ...         1.0   1.0    1.0

# And keep the first indices, here, only date, in addition of the columns you wished to keep earlier.
final_df = df[ ['date'] + [*to_keep] ]
print(final_df)
#               date  LONER    I   AM
# 0        some date    1.0  1.0  1.0
# 1  some_other_date    NaN  1.0  1.0
# 2         lol date    NaN  NaN  NaN
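The answer stops at the per-row table; to get the totals the question asked for and save them to a third csv, one could sum the kept columns. A sketch under the same toy data (the `word_frequencies.csv` name is my own choice):

```python
import pandas as pd
from collections import Counter
from pandas import json_normalize  # top-level since pandas 1.0

to_keep = ['LONER', 'I', 'AM']
df = pd.DataFrame({
    'date': ['some date', 'some_other_date', 'lol date'],
    'garbage': ['I AM A LONER', 'AND SO AM I LOL', 'some other weird sentence'],
})

# One Counter dict per row, normalized into word-count columns
counts = json_normalize(df['garbage'].apply(lambda x: Counter(x.split())).tolist())

# Sum each kept word's column over all rows (NaN entries are skipped)
totals = counts[to_keep].sum()
print(totals.to_dict())  # {'LONER': 1.0, 'I': 2.0, 'AM': 2.0}

# Write the totals out as the third csv
totals.rename('frequency').to_csv('word_frequencies.csv', header=True)
```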
