Using merge and groupby to bring a DF into a new scheme

Published 2024-04-18 11:15:41


I have a DF with a lot of entries; an excerpt is shown below.

DF_OLD =
...
sID   tID   NER      token           Prediction
274   79    U-Peop   khrushchev      Live_In-ARG2+B
274   79    O        's              Live_IN-ARG2+L
807   53    U-Loc    louisiana       Live_IN-ARG2+U
807   56    B-Peop   earl            Live_IN-ARG1+B
807   57    L-Peop   long            Live_IN-ARG1+L
807   13    B-Peop   dwight          Live_IN-ARG1+B
807   13    I-Peop   d.              Live_IN-ARG1+I
807   13    L-Peop   eisenhower      Live_IN-ARG1+L
...

sID separates the individual sentences. The Prediction column holds the output of a machine-learning classifier, and those predictions can be fairly nonsensical. My goal is to group all predicted labels into one scheme, for example:

DF_Expected =
...
sID   entity1              tID1    entity2           tID2   Relation
274   NaN                  NaN     khrushchev 's     79     Live_In 
807   earl long            56 57   louisiana         53     Live_In
807   dwight d. eisenhower 13      louisiana         53     Live_In
...

The "-ARGX" part tells you which entity column of the table an entity belongs in, while the part before the first "-" gives the relation. If one of the argument parts is missing, the corresponding cells should stay empty.
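Splitting such a label apart is mechanical; a minimal sketch with str.extract (the pattern and the column names Relation/Arg/Pos are my own, not from the data):

```python
import pandas as pd

preds = pd.Series(["Live_In-ARG2+B", "Live_IN-ARG1+L"])

# relation before the first "-", argument slot between "-" and "+",
# BILOU position after the "+"
parts = preds.str.extract(
    r"^(?P<Relation>[^-]+)-(?P<Arg>ARG\d)\+(?P<Pos>[BILOU])$")
```

Each named group becomes a column of the resulting frame, so the pieces can be used directly as groupby keys.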

Here is what I tried:

DF["Live_In_Predict_Split"] = DF["Prediction"].str.split("+").str[0]
DF["token2"] = DF["token"]
DF["tID2"] = DF["tID"]
DF["Live_In_Predict_Split2"] = DF["Live_In_Predict_Split"]
data_tokeni_map = DF.groupby(["Live_In_Predict_Split", "sID"], as_index=True, sort=False).agg(" ".join).reset_index()
s = data_tokeni_map.loc[:, ["sID", "token2", "tID2", "Live_In_Predict_Split2"]].merge(
        data_tokeni_map.loc[:, ["sID", "token", "tID", "Live_In_Predict_Split"]], on="sID")
s = s.loc[s.token2 != s.token].drop_duplicates()

What I am missing is some kind of counter to tell the different "-ARGX" groups apart, plus a suitable GroupBy key (grouping by tID is no good, because it produces wrong results). As it stands, my new DF comes out wrong:

DF_EDITED =
...
sID   entity1                         tID1      entity2                        tID2   ...
807   dwight d eisenhower earl long   13 56 57  louisiana                      53
807   louisiana                       13 56 57  dwight d eisenhower earl long  53

Edit:

My code has changed a bit. All useless predictions are now dropped, but all similar predictions are still lumped together. I need some kind of preprocessing step that brings the data into the form below, i.e. I need to count all predictions per sID and number them:

DF_OLD_Edit =
...
sID   tID   NER      token           Prediction
274   79    U-Peop   khrushchev      Live_IN-ARG2+B_1
274   79    O        's              Live_IN-ARG2+L_1
807   53    U-Loc    louisiana       Live_IN-ARG2+U_1
807   56    B-Peop   earl            Live_IN-ARG1+B_1
807   57    L-Peop   long            Live_IN-ARG1+L_1
807   13    B-Peop   dwight          Live_IN-ARG1+B_2
807   13    I-Peop   d.              Live_IN-ARG1+I_2
807   13    L-Peop   eisenhower      Live_IN-ARG1+L_2
...
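A counter of that shape does not have to be written by hand: B and U open a new entity span, so a cumulative sum of span openers per sID and argument slot yields exactly the _1/_2 suffixes above. A sketch (standard pandas calls, but the intermediate names are my own):

```python
import pandas as pd

df = pd.DataFrame({
    "sID": [274, 274, 807, 807, 807, 807, 807, 807],
    "tID": [79, 79, 53, 56, 57, 13, 13, 13],
    "token": ["khrushchev", "'s", "louisiana", "earl", "long",
              "dwight", "d.", "eisenhower"],
    "Prediction": ["Live_IN-ARG2+B", "Live_IN-ARG2+L", "Live_IN-ARG2+U",
                   "Live_IN-ARG1+B", "Live_IN-ARG1+L", "Live_IN-ARG1+B",
                   "Live_IN-ARG1+I", "Live_IN-ARG1+L"],
})

pos = df["Prediction"].str[-1]                        # B / I / L / U
arg = df["Prediction"].str.extract(r"-(ARG\d)\+")[0]  # ARG1 / ARG2
# every B or U starts a new entity, so counting them per (sID, arg)
# numbers the entity spans
counter = pos.isin(["B", "U"]).astype(int).groupby([df["sID"], arg]).cumsum()
df["Prediction"] = df["Prediction"] + "_" + counter.astype(str)
```

After this, Prediction carries the suffixed labels shown in DF_OLD_Edit.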

Tags: in, token, live, df, long, prediction, arg1, arg2
2 answers

You have to mix plain functions and DF operations. The approach is not efficient at all, but it works.

import re

def combine(some_list):
    current_group = 0
    g_size = 0
    for elem in some_list:
        g_size += 1
        if elem.endswith('U'):
            if g_size > 1:
                g_size = 1
                current_group += 1
        yield '{}{}'.format(current_group, elem)
        if elem.endswith(('L', 'U')):
            g_size = 0
            current_group += 1

def splitter(pred_group):
    # pull the leading group id (up to three digits) off a Pred_Group value
    return re.findall(r'^\d{1,3}', pred_group)

# Not very efficient
DF["entity2"] = DF["entity"]
DF["tID2"] = DF["tID"]
DF["Prediction2"] = DF["Prediction"]
DF["Pred_Group"] = list(combine(DF["Prediction"].tolist()))
DF["Jojo"] = DF["Pred_Group"].apply(splitter).apply(lambda x: " ".join(x))
dmap = DF.groupby(["Jojo", "sID"], as_index=True, sort=False).agg(" ".join).reset_index()
s = dmap.loc[:, ["sID", "entity2", "tID2", "Prediction2"]].merge(
        dmap.loc[:, ["sID", "entity", "tID", "Prediction"]], on="sID")
s = s.loc[s.entity2 != s.entity].drop_duplicates()
s = s[s["Prediction"].str.contains(r"-ARG2\+")]    # escape the literal "+"
DF = s[s["Prediction2"].str.contains(r"-ARG1\+")]
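For a quick sanity check of the grouping generator, running a copy of it on just the BILOU suffixes of the eight sample rows assigns one running id per entity span:

```python
def combine(some_list):
    # same generator as above: prefix each tag with a running group id
    current_group = 0
    g_size = 0
    for elem in some_list:
        g_size += 1
        if elem.endswith('U'):
            if g_size > 1:
                g_size = 1
                current_group += 1
        yield '{}{}'.format(current_group, elem)
        if elem.endswith(('L', 'U')):
            g_size = 0
            current_group += 1

tags = ['B', 'L', 'U', 'B', 'L', 'B', 'I', 'L']
groups = list(combine(tags))
# -> ['0B', '0L', '1U', '2B', '2L', '3B', '3I', '3L']
```

L and U close a span, so the four sample entities end up in groups 0 to 3.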

Data:

df

   sID  tID     NER       token        Prediction
0  274   79  U-Peop  khrushchev  Live_IN-ARG2+B_1
1  274   79       O          's  Live_IN-ARG2+L_1
2  807   53   U-Loc   louisiana  Live_IN-ARG2+U_1
3  807   56  B-Peop        earl  Live_IN-ARG1+B_1
4  807   57  L-Peop        long  Live_IN-ARG1+L_1
5  807   13  B-Peop      dwight  Live_IN-ARG1+B_2
6  807   13  I-Peop          d.  Live_IN-ARG1+I_2
7  807   13  L-Peop  eisenhower  Live_IN-ARG1+L_2

Code:

import numpy as np
import pandas as pd
import typing

# setting up some columns for groupby
df['arg'] = df.Prediction.apply(lambda x: x.split("_")[1].split("-")[1].split("+")[0])
df['Relation'] = df.Prediction.apply(lambda x: x.split("-")[0])
df['ingroup_id'] = df.Prediction.apply(lambda x: x.split("_")[-1])

# groupby and collect relevant tID and token
df1 = df.groupby(['sID', 'arg', 'ingroup_id']).tID.apply(list)
df2 = df.groupby(['sID', 'arg', 'ingroup_id']).token.apply(list)
df3 = pd.concat([df1, df2], axis=1).reset_index()
df3.tID = df3.tID.apply(lambda x: sorted(set(x)))  # dedupe; sort for a deterministic order

# setting up columns that we finally use
df3.loc[df3.arg == 'ARG1', 'tID1'] = df3.tID
df3.loc[df3.arg == 'ARG2', 'tID2'] = df3.tID
df3.loc[df3.arg == 'ARG1', 'entity1'] = df3.token
df3.loc[df3.arg == 'ARG2', 'entity2'] = df3.token

# sort values and then ffill/bfill within the group
df3 = df3.sort_values(['sID', 'arg']).reset_index(drop=True)
df3.tID1 = df3.groupby(['sID']).tID1.ffill()
df3.entity1 = df3.groupby(['sID']).entity1.ffill()
df3.tID2 = df3.groupby(['sID']).tID2.bfill()
df3.entity2 = df3.groupby(['sID']).entity2.bfill()
df3 = df3[['sID', 'entity1', 'tID1', 'entity2', 'tID2']].set_index('sID')

# converting lists in cells into strings (may be someone can make this as a one liner)
df3.entity1 = df3.entity1.apply(lambda x: ' '.join(x) if isinstance(x, typing.List) else np.nan)
df3.entity2 = df3.entity2.apply(lambda x: ' '.join(x) if isinstance(x, typing.List) else np.nan)
df3.tID1 = df3.tID1.apply(lambda x: ' '.join(str(y) for y in x) if isinstance(x, typing.List) else np.nan)
df3.tID2 = df3.tID2.apply(lambda x: ' '.join(str(y) for y in x) if isinstance(x, typing.List) else np.nan)
df3 = df3.drop_duplicates().reset_index()

df3 = df3.merge(df[['sID', 'Relation']].drop_duplicates(), on='sID', how='left')

Output:

   sID               entity1   tID1        entity2 tID2 Relation
0  274                   NaN    NaN  khrushchev 's   79  Live_IN
1  807             earl long  56 57      louisiana   53  Live_IN
2  807  dwight d. eisenhower     13      louisiana   53  Live_IN
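As an aside on the "one liner" wished for in the comments: the four list-to-string conversions can be done in a single pass over the affected columns. A sketch with toy values (DataFrame.apply plus Series.map; not taken from the answer):

```python
import numpy as np
import pandas as pd

df3 = pd.DataFrame({
    "entity1": [np.nan, ["earl", "long"]],
    "tID1": [np.nan, [56, 57]],
})

def to_str(x):
    # join list cells with spaces, leave missing cells as NaN
    return " ".join(str(y) for y in x) if isinstance(x, list) else np.nan

cols = ["entity1", "tID1"]
df3[cols] = df3[cols].apply(lambda col: col.map(to_str))
```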

For lack of skill the code is long, but what it basically does is groupby and merge, as you suggested in the title. Hope this helps.
