根据段落类型在pandas数据框列中进行条件更新

0 投票
2 回答
56 浏览
提问于 2025-04-14 15:43

我有一个数据表,长得像这样:

 data = {
    'document_section_id': ['1', '1.1', None, None, None, None, None, None, None, '3', None, None, None, '4.1', None, None, None, None, None, None, None, None, None, None, None, None, None, None, '1.2', None, None, None, None, None, None, None, '1.3', '1.3.1', None, None, '1.3.2', None, None, None, None, '1.3.3', None, None, None, '1.3.4', None, None,None],
    '   ': ['Heading1', 'Heading2', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'TblFootnote', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'Heading2', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'TblFootnote', 'Heading3', 'Heading3', 'Heading3', 'Heading1']
}

我想做以下几件事:

  1. 获取从一个"Heading1"到下一个"Heading1"之间的所有行,这些行是在"paragraph_type"这一列中的。
  2. 把第一个"Heading1"对应的"SECTION_ID"值存到一个叫做var1的变量里。
  3. 在这个指定的范围内(从Heading1到Heading1),如果任何行的列的值不以var1存的值开头,就把它的"SECTION_ID"替换成空字符串。例如:第10行和第14行的section id不以1开头,所以把它的值更新为''。

这个操作要对所有现有的Heading1都进行。

这是我尝试过的,但没有成功:

modified_dfs = []
for i in range(len(df[df['PARAGRAPH_TYPE'] == 'Heading1']) - 1):
    start_index = df[df['PARAGRAPH_TYPE'] == 'Heading1'].index[i]
    end_index = df[df['PARAGRAPH_TYPE'] == 'Heading1'].index[i+1]
    var1 = df.loc[start_index, 'SECTION_ID']

    section_df = df.reset_index().loc[start_index:end_index-1].copy()

    for j in range(start_index, end_index):
        SECTION_ID = section_df.loc[j, 'SECTION_ID']
        if isinstance(SECTION_ID, str) and not SECTION_ID.startswith(str(var1)):
            section_df.loc[j, 'SECTION_ID'] = ''

    modified_dfs.append(section_df)

result_df = pd.concat(modified_dfs)

期望的输出结果:

    output = {
    'document_section_id': ['1', '1.1', None, None, None, None, None, None, None, '', None, None, None, '', None, None, None, None, None, None, None, None, None, None, None, None, None, None, '1.2', None, None, None, None, None, None, None, '1.3', '1.3.1', None, None, '1.3.2', None, None, None, None, '1.3.3', None, None, None, '1.3.4', None, None, None, None],
    ' ': ['Heading1', 'Heading2', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'TblFootnote', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'Heading2', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'TblFootnote', 'Heading3', 'Heading3', 'Heading3', 'Heading1']
}

2 个回答

1

你更新后的输出和我最开始用的那个完全不一样。而且你说的那句“你可以忽略输出,基本上3和4.1反正需要替换成空字符串”也调整了算法。

我找到了一个解决你最初问题和数据的方法。虽然这个方法看起来不太好,但确实能用。再加上你的问题和数据每3分钟就会变一次,我就不打算花更多时间去解决了。所以,也许这个方法可以作为你的起点。

def unique_headings():
    unique_list = []
    for val in PARAGRAPH_TYPE:
        if val not in unique_list:
            unique_list.append(val)
    return unique_list


def numeric_vals(_unique_list):
    return {
        val: val.split("Heading")[1]
        for val in _unique_list
        if "Heading" in val and val.split("Heading")[1][0].isnumeric()
    }


def get_idx_of_vals(heading_dict):
    out_dict = heading_dict.copy()

    as_array = np.array(PARAGRAPH_TYPE)
    for key, value in heading_dict.items():
        out_dict[key] = (value, np.where(as_array == key)[0])
    return out_dict

unique_list = unique_headings()
# ['Heading1', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'ListParagraph', 'Heading3']

num_vals = numeric_vals(unique_list)
# {'Heading1': '1', 'Heading2': '2', 'Heading3': '3'}

with_positions = get_idx_of_vals(num_vals)
# {'Heading1': ('1', array([ 0, 52], dtype=int64)), 'Heading2': ('2', array([ 1, 30, 38], dtype=int64)), 'Heading3': ('3', array([40, 42, 45, 47, 49, 50, 51], dtype=int64))}

def check_and_map(_df, mapper):
    def apply_check(_ser, _val):
        if (_ser["section"] and _ser["section"] != _val and _ser.name not in mapped_ranges) \
                or _ser["section"] is None:
            out_ser[_ser.name] = ""
        else:
            out_ser[_ser.name] = _ser["section"]

    out_df = _df.copy()
    reverse_ordered = dict(sorted(mapper.items(), key=lambda kv: int(kv[1][0]), reverse=True))
    mapped_ranges = []
    out_ser = _df["section"]
    for value_tuple in reverse_ordered.values():
        val, arr = value_tuple
        min_idx, max_idx = arr.min(), arr.max()
        out_df[min_idx:max_idx + 1].apply(
            lambda x: apply_check(x, val),
            axis=1
        )
        mapped_ranges.extend(list(range(min_idx, max_idx + 1)))
    return out_ser


SECTION_ID_DF = pd.DataFrame(dict(section=SECTION_ID))
mapped_ser = check_and_map(SECTION_ID_DF, with_positions)
print(mapped_ser.to_list())
# ['1', '1.1', '', '', '', '', '', '', '', '3', '', '', '', '4.1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1.2', '', '', '', '', '', '', '', '1.3', '1.3.1', '', '', '1.3.2', '', '', '', '', '1.3.3', '', '', '', '1.3.4', '', '']


1

我会使用 packaging.version.Version 这个工具,配合一个反向的 cummin 方法,来找出出问题的版本:

from packaging.version import Version

s = df['document_section_id'].replace('', None).dropna().apply(Version)
df.loc[s.index[s[::-1].cummin().ne(s)], 'document_section_id'] = ''

输出结果:

   document_section_id               
0                    1       Heading1
1                  1.1       Heading2
2                 None   HeadingNoTOC
3                 None   HeadingNoTOC
4                 None   HeadingNoTOC
5                 None   HeadingNoTOC
6                 None   HeadingNoTOC
7                 None    TblFootnote
8                 None   HeadingNoTOC
9                        HeadingNoTOC
10                None   HeadingNoTOC
11                None   HeadingNoTOC
12                None   HeadingNoTOC
13                      ListParagraph
14                None  ListParagraph
15                None  ListParagraph
16                None  ListParagraph
17                None  ListParagraph
18                None  ListParagraph
19                None  ListParagraph
20                None  ListParagraph
21                None  ListParagraph
22                None  ListParagraph
23                None   HeadingNoTOC
24                None   HeadingNoTOC
25                None   HeadingNoTOC
26                None   HeadingNoTOC
27                None   HeadingNoTOC
28                 1.2   HeadingNoTOC
29                None   HeadingNoTOC
30                None       Heading2
31                None   HeadingNoTOC
32                None    TblFootnote
33                None    TblFootnote
34                None    TblFootnote
35                None    TblFootnote
36                 1.3    TblFootnote
37               1.3.1    TblFootnote
38                None       Heading2
39                None   HeadingNoTOC
40               1.3.2       Heading3
41                None   HeadingNoTOC
42                None       Heading3
43                None   HeadingNoTOC
44                None   HeadingNoTOC
45               1.3.3       Heading3
46                None   HeadingNoTOC
47                None       Heading3
48                None    TblFootnote
49               1.3.4       Heading3
50                None       Heading3
51                None       Heading3
52                None       Heading1

中间结果:

        s reversed_cummin     !=
0       1               1  False
1     1.1             1.1  False
9       3             1.2   True
13    4.1             1.2   True
28    1.2             1.2  False
36    1.3             1.3  False
37  1.3.1           1.3.1  False
40  1.3.2           1.3.2  False
45  1.3.3           1.3.3  False
49  1.3.4           1.3.4  False

撰写回答