根据段落类型在pandas数据框列中进行条件更新
我有一个数据表,长得像这样:
data = {
'document_section_id': ['1', '1.1', None, None, None, None, None, None, None, '3', None, None, None, '4.1', None, None, None, None, None, None, None, None, None, None, None, None, None, None, '1.2', None, None, None, None, None, None, None, '1.3', '1.3.1', None, None, '1.3.2', None, None, None, None, '1.3.3', None, None, None, '1.3.4', None, None,None],
' ': ['Heading1', 'Heading2', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'TblFootnote', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'Heading2', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'TblFootnote', 'Heading3', 'Heading3', 'Heading3', 'Heading1']
}
我想做以下几件事:
- 获取从一个"
Heading1
"到下一个"Heading1
"之间的所有行,这些行是在"paragraph_type
"这一列中的。 - 把第一个"
Heading1
"对应的"SECTION_ID
"值存到一个叫做var1
的变量里。 - 在这个指定的范围内(从Heading1到Heading1),如果任何行的
列的值不以 var1
存的值开头,就把它的"SECTION_ID"替换成空字符串。例如:第10行和第14行的section id不以1开头,所以把它的值更新为''。
这个操作要对所有现有的Heading1都进行。
这是我尝试过的,但没有成功:
modified_dfs = []
for i in range(len(df[df['PARAGRAPH_TYPE'] == 'Heading1']) - 1):
start_index = df[df['PARAGRAPH_TYPE'] == 'Heading1'].index[i]
end_index = df[df['PARAGRAPH_TYPE'] == 'Heading1'].index[i+1]
var1 = df.loc[start_index, 'SECTION_ID']
section_df = df.reset_index().loc[start_index:end_index-1].copy()
for j in range(start_index, end_index):
SECTION_ID = section_df.loc[j, 'SECTION_ID']
if isinstance(SECTION_ID, str) and not SECTION_ID.startswith(str(var1)):
section_df.loc[j, 'SECTION_ID'] = ''
modified_dfs.append(section_df)
result_df = pd.concat(modified_dfs)
期望的输出结果:
output = {
'document_section_id': ['1', '1.1', None, None, None, None, None, None, None, '', None, None, None, '', None, None, None, None, None, None, None, None, None, None, None, None, None, None, '1.2', None, None, None, None, None, None, None, '1.3', '1.3.1', None, None, '1.3.2', None, None, None, None, '1.3.3', None, None, None, '1.3.4', None, None, None, None],
' ': ['Heading1', 'Heading2', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'TblFootnote', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'ListParagraph', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'TblFootnote', 'Heading2', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'HeadingNoTOC', 'Heading3', 'HeadingNoTOC', 'Heading3', 'TblFootnote', 'Heading3', 'Heading3', 'Heading3', 'Heading1']
}
2 个回答
1
你更新后的输出和我最开始用的那个完全不一样。而且你说的那句“你可以忽略输出,基本上3和4.1反正需要替换成空字符串”也调整了算法。
我找到了一个解决你最初问题和数据的方法。虽然这个方法看起来不太好,但确实能用。再加上你的问题和数据每3分钟就会变一次,我就不打算花更多时间去解决了。所以,也许这个方法可以作为你的起点。
def unique_headings():
unique_list = []
for val in PARAGRAPH_TYPE:
if val not in unique_list:
unique_list.append(val)
return unique_list
def numeric_vals(_unique_list):
return {
val: val.split("Heading")[1]
for val in _unique_list
if "Heading" in val and val.split("Heading")[1][0].isnumeric()
}
def get_idx_of_vals(heading_dict):
out_dict = heading_dict.copy()
as_array = np.array(PARAGRAPH_TYPE)
for key, value in heading_dict.items():
out_dict[key] = (value, np.where(as_array == key)[0])
return out_dict
unique_list = unique_headings()
# ['Heading1', 'Heading2', 'HeadingNoTOC', 'TblFootnote', 'ListParagraph', 'Heading3']
num_vals = numeric_vals(unique_list)
# {'Heading1': '1', 'Heading2': '2', 'Heading3': '3'}
with_positions = get_idx_of_vals(num_vals)
# {'Heading1': ('1', array([ 0, 52], dtype=int64)), 'Heading2': ('2', array([ 1, 30, 38], dtype=int64)), 'Heading3': ('3', array([40, 42, 45, 47, 49, 50, 51], dtype=int64))}
def check_and_map(_df, mapper):
def apply_check(_ser, _val):
if (_ser["section"] and _ser["section"] != _val and _ser.name not in mapped_ranges) \
or _ser["section"] is None:
out_ser[_ser.name] = ""
else:
out_ser[_ser.name] = _ser["section"]
out_df = _df.copy()
reverse_ordered = dict(sorted(mapper.items(), key=lambda kv: int(kv[1][0]), reverse=True))
mapped_ranges = []
out_ser = _df["section"]
for value_tuple in reverse_ordered.values():
val, arr = value_tuple
min_idx, max_idx = arr.min(), arr.max()
out_df[min_idx:max_idx + 1].apply(
lambda x: apply_check(x, val),
axis=1
)
mapped_ranges.extend(list(range(min_idx, max_idx + 1)))
return out_ser
SECTION_ID_DF = pd.DataFrame(dict(section=SECTION_ID))
mapped_ser = check_and_map(SECTION_ID_DF, with_positions)
print(mapped_ser.to_list())
# ['1', '1.1', '', '', '', '', '', '', '', '3', '', '', '', '4.1', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '1.2', '', '', '', '', '', '', '', '1.3', '1.3.1', '', '', '1.3.2', '', '', '', '', '1.3.3', '', '', '', '1.3.4', '', '']
1
我会使用 packaging.version.Version
这个工具,配合一个反向的 cummin
方法,来找出出问题的版本:
from packaging.version import Version
s = df['document_section_id'].replace('', None).dropna().apply(Version)
df.loc[s.index[s[::-1].cummin().ne(s)], 'document_section_id'] = ''
输出结果:
document_section_id
0 1 Heading1
1 1.1 Heading2
2 None HeadingNoTOC
3 None HeadingNoTOC
4 None HeadingNoTOC
5 None HeadingNoTOC
6 None HeadingNoTOC
7 None TblFootnote
8 None HeadingNoTOC
9 HeadingNoTOC
10 None HeadingNoTOC
11 None HeadingNoTOC
12 None HeadingNoTOC
13 ListParagraph
14 None ListParagraph
15 None ListParagraph
16 None ListParagraph
17 None ListParagraph
18 None ListParagraph
19 None ListParagraph
20 None ListParagraph
21 None ListParagraph
22 None ListParagraph
23 None HeadingNoTOC
24 None HeadingNoTOC
25 None HeadingNoTOC
26 None HeadingNoTOC
27 None HeadingNoTOC
28 1.2 HeadingNoTOC
29 None HeadingNoTOC
30 None Heading2
31 None HeadingNoTOC
32 None TblFootnote
33 None TblFootnote
34 None TblFootnote
35 None TblFootnote
36 1.3 TblFootnote
37 1.3.1 TblFootnote
38 None Heading2
39 None HeadingNoTOC
40 1.3.2 Heading3
41 None HeadingNoTOC
42 None Heading3
43 None HeadingNoTOC
44 None HeadingNoTOC
45 1.3.3 Heading3
46 None HeadingNoTOC
47 None Heading3
48 None TblFootnote
49 1.3.4 Heading3
50 None Heading3
51 None Heading3
52 None Heading1
中间结果:
s reversed_cummin !=
0 1 1 False
1 1.1 1.1 False
9 3 1.2 True
13 4.1 1.2 True
28 1.2 1.2 False
36 1.3 1.3 False
37 1.3.1 1.3.1 False
40 1.3.2 1.3.2 False
45 1.3.3 1.3.3 False
49 1.3.4 1.3.4 False