从pandas数据帧中用多个条件匹配、替换和提取子字符串的最快方法是什么？

import numpy as np
import pandas as pd

# Sample case records: a free-text list of panel judges plus the court number
# and year of each case.
test_data = {
    'panel_judges': [
        'CHAGARES, VANASKIE, SCHWARTZ',
        'Sidney R. Thomas, Barry G. Silverman, Raymond C. Fisher, '
        'Opinion by Thomas',
    ],
    'court_num': [3, 9],
    'date_year': [2014, 2014],
}
test_df = pd.DataFrame(data=test_data)

# Judge lookup table. ``hq_match`` holds the stricter regex for each judge and
# ``lq_match`` the surname-only fallback. Raw strings keep ``\.`` a regex
# escape instead of an invalid Python string escape.
name_dict = {
    'full_name': [
        'Chagares, Michael A.',
        'Vanaskie, Thomas Ignatius',
        'Schwartz, Charles, Jr.',
        'Schwartz, Edward Joseph',
        'Schwartz, Milton Lewis',
        'Schwartz, Murray Merle',
    ],
    'court_num': [3, 3, 1061, 1097, 1058, 1013],
    'circuit_num': [3, 3, 5, 9, 9, 3],
    'start_year': [2006, 2010, 1976, 1968, 1979, 1974],
    'end_year': [2016, 2019, 2012, 2000, 2005, 2013],
    'hq_match': [
        r'M(?=ICHAEL)? ?A?(?=\.)? ?CHAGARES',
        r'T(?=HOMAS)? ?I?(?=GNATIUS)? ?VANASKIE',
        r'C(?=HARLES)? SCHWARTZ',
        r'E(?=DWARD)? ?J?(?=OSEPH)? ?SCHWARTZ',
        r'M(?=ILTON)? ?L?(?=EWIS)? ?SCHWARTZ',
        r'M(?=URRAY)? ?M?(?=ERLE)? ?SCHWARTZ',
    ],
    'lq_match': [
        'CHAGARES', 'VANASKIE', 'SCHWARTZ',
        'SCHWARTZ', 'SCHWARTZ', 'SCHWARTZ',
    ],
}
names = pd.DataFrame(data=name_dict)

in_col = 'panel_judges'
year_col = 'date_year'
out_col = 'fixed_panel'
court_num_col = 'court_num'

# The output column starts as an empty string and grows by concatenation.
# (The former ``astype(list, inplace=True)`` call was dropped: ``astype`` has
# no ``inplace`` argument on current pandas and its result was discarded.)
test_df[out_col] = ''


def judge_matcher(df, in_col, out_col, year_col, court_num_col,
                  size_column=None):
    """Fill ``out_col`` with the full names of judges found in ``in_col``.

    Four passes are made, in this order: ``hq_match`` patterns filtered by
    ``court_num``, ``hq_match`` by ``circuit_num``, then ``lq_match`` by
    ``court_num`` and ``lq_match`` by ``circuit_num``. Within a pass, rows are
    grouped by (year, court number) and only judges from the module-level
    ``names`` table whose ``start_year <= year < end_year + 2`` and whose
    court/circuit equals the row's court number are tried.

    Args:
        df: Frame to process; modified in place.
        in_col: Column holding the raw judge text. Matched text is removed
            from it and processed rows are upper-cased.
        out_col: Column receiving the matched full names, each followed by
            ``', '``. It must already exist and hold strings.
        year_col: Column holding the integer year of each row.
        court_num_col: Column holding the integer court number of each row.
        size_column: Accepted for interface compatibility; unused.

    Returns:
        The same ``df`` object.
    """
    if df.empty:
        # min()/max() of an empty column cannot define a search range.
        return df

    general_cols = ['start_year', 'end_year', 'full_name']
    court_cols = ['court_num', 'circuit_num']
    match_cols = ['hq_match', 'lq_match']

    # Only (year, court) pairs that actually occur can select any rows, so
    # visit those instead of every integer between the column min and max.
    groups = list(
        df[[year_col, court_num_col]]
        .drop_duplicates()
        .itertuples(index=False, name=None)
    )

    for match_col in match_cols:
        for court_col in court_cols:
            judge_df = names[general_cols + [court_col, match_col]]
            for year, court in groups:
                lookup_mask = (
                    (judge_df['start_year'] <= year)
                    & (year < (judge_df['end_year'] + 2))
                    & (judge_df[court_col] == court)
                )
                if not lookup_mask.any():
                    # No candidate judges: matcher would leave rows untouched.
                    continue
                row_mask = (
                    (df[year_col] == year) & (df[court_num_col] == court)
                )
                # Work on an explicit copy, then write back only the two
                # columns matcher changes.
                matched = matcher(df.loc[row_mask].copy(), in_col, out_col,
                                  judge_df.loc[lookup_mask], match_col)
                df.loc[row_mask, in_col] = matched[in_col]
                df.loc[row_mask, out_col] = matched[out_col]
    return df


def matcher(df, in_col, out_col, lookup, keys):
    """Apply every pattern in ``lookup[keys]`` to ``df[in_col]``.

    For each pattern that matches the upper-cased input text, the paired
    ``full_name`` plus ``', '`` is appended to ``df[out_col]`` and the matched
    text is removed from ``df[in_col]``. If ``lookup`` has no rows, ``df`` is
    returned untouched; otherwise ``df[in_col]`` ends up upper-cased.

    Args:
        df: Frame to update; modified in place.
        in_col: Column with the text to search.
        out_col: Column accumulating matched full names.
        lookup: Frame with a ``full_name`` column and the ``keys`` column.
        keys: Name of the column in ``lookup`` holding the regex patterns.

    Returns:
        The same ``df`` object.
    """
    # Duplicate patterns collapse to one entry; the last full_name wins.
    patterns = dict(zip(lookup[keys], lookup['full_name']))
    for pattern, full_name in patterns.items():
        upper_text = df[in_col].astype(str).str.upper()
        hit = upper_text.str.contains(pattern, regex=True)
        df[out_col] = np.where(hit, df[out_col] + full_name + ', ',
                               df[out_col])
        # regex=True is explicit because the default differs across pandas
        # versions and the keys are regular expressions.
        df[in_col] = upper_text.str.replace(pattern, '', regex=True)
    return df


df = judge_matcher(test_df, in_col, out_col, year_col, court_num_col)

1条回答

网友

1楼 · 发布于 2024-05-18 14:44:27

对于偶然看到这个问题、并且在 pandas 中遇到类似复杂字符串匹配问题的人来说，下面是我找到的最快的解决方案。

它没有像我希望的那样完全向量化，而是通过 DataFrame.apply 逐行调用类中的以下方法：

def judge_matcher(self, row, in_col, out_col, year_col, court_num_col,
                  size_col = None):
    """Resolve the raw judge names in one row via dictionary lookups.

    ``row[in_col]`` is iterated as a sequence of name strings; falsy entries
    are dropped and the rest are stripped. Each name is tried against the
    five dictionaries in ``self.names_dict_list`` in order: indexes 0 and 1
    with the key built by ``convert_judge_name(..., 1)``, indexes 2 and 3
    with the key built by ``convert_judge_name(..., 2)``, and index 4 with
    the bare name. Only the first hit is kept for each name.

    The collected values are de-duplicated, sorted and stored in
    ``row[out_col]``. When ``size_col`` is given and at least one value was
    found, their count is stored in ``row[size_col]``.

    Returns:
        The same ``row`` object, updated in place.
    """
    names_in_row = [entry.strip() for entry in row[in_col] if entry]
    resolved = []
    for name in names_in_row:
        court_year_key = self.convert_judge_name(row[year_col],
                                                 row[court_num_col], name, 1)
        year_key = self.convert_judge_name(row[year_col],
                                           row[court_num_col], name, 2)
        # (dictionary index, lookup key), most specific first.
        attempts = (
            (0, court_year_key),
            (1, court_year_key),
            (2, year_key),
            (3, year_key),
            (4, name),
        )
        for position, key in attempts:
            table = self.names_dict_list[position]
            if key in table:
                resolved.append(table.get(key))
                break
    resolved = sorted(unique_everseen(resolved))
    row[out_col] = resolved
    if size_col:
        if resolved:
            row[size_col] = len(resolved)
    return row

@staticmethod
def convert_judge_name(year, court, name, dict_type):
    if dict_type == 1:
        return str(int(court) * 10000 + int(year)) + name
    elif dict_type == 2:
        return str(int(year)) + name
    else:
        return name

基本上，它把三列的值拼接成一个字符串，再用这个字符串做哈希字典查找（而不是使用正则表达式）。乘法用于把两个数字高效地并排拼接成一个字符串。字典的键也按同样的方式预先构造好（值就是所需的字符串）。由于使用了列表并在最后去重，我不必再删除已经匹配的字符串。我没有单独为这个函数计时，但整个模块处理大约100万行只用了10个多小时。下次运行时，我会尽量记下这个 apply 函数的具体耗时，并把结果贴在这里。这个方法不好看，但相当有效。

相关问题更多 >

编程相关推荐

热门问题

热门文章