document = Document(<YOUR_DOC>)
# Data will be a list of rows represented as dictionaries
# containing each row's data.
characters = {}
for paragraph in <YOUR_PARAGRAPHS>:
run_string = ""
run_index = {}
i = 0
for x, run in enumerate(paragraph.runs):
# Create a string consisting of all the runs' text. Theoretically this
# should always be the same as parapgrah.text, but I didn't check
run_string = run_string + run.text
# The index i represents the starting position of the run in question
# within the string. We are creating a dictionary of form
# {<run_start_location>: <pointer_to_run>}
run_index[i] = x
# This will be the start of the next run
i = i + len(run.text)
word_you_wanted_to_find = re.findall("some_regex", paragraph.text)
for word in word_you_wanted_to_find:
# [m.start() for m in re.finditer(word, run_string)] returns the starting
# positions of each word that was found
for word_start in [m.start() for m in re.finditer(word, run_string)]:
word_end = word_start + len(word)
# This will be a list of the indices of the runs which have part
# of the word we want to include
included_runs = []
for key in run_index.keys():
# Remember, the key is the location in the string of the start of
# the run. In this case, the start of the word start should be less than
# the key+len(run) and the end of the word should be greater
# than the key (the start of the run)
if word_start <= (key + len(paragraph.runs[run_index[key]].text)) and key < word_end:
included_runs.append(key)
# If the key is larger than or equal to the end of the word,
# this means we have found all relevant keys. We don't need
# to loop over the rest (we could, it just wouldn't be efficient)
if key >= word_end:
break
# At this point, included_runs is a full list of indices to the relevant
# runs so we can modify each one in turn.
for run_key in included_runs:
paragraph.runs[run_index[run_key]].italic = True
document.save(<MODIFIED_DOC>)
我将不讨论这个问题,因为我提出的方法并不完美,但它在绝大多数情况下都有效。代码如下:
问题1
这种方法的问题是,虽然不常见(至少在我的文档中是这样),但一个 run 可能包含的不仅仅是目标单词。因此,你最终可能会把整个 run(包括目标单词以及同一 run 中的其他文本)都设为斜体。对于我的用例来说,在这里解决这个问题是没有意义的。
解决方案
如果要完善我在上面所做的工作,您必须更改上面代码中把匹配到的各个 run 设为斜体的那段循环:
在这里,您已经确定了包含目标单词的 run。您需要扩展代码,将该单词拆分到它自己的 run 中,并将其从当前 run 中删除。然后您就可以单独将这个新的 run 设为斜体。
问题2
上面显示的代码不能同时处理表格和普通文本。对于我的用例,我不需要这样做,但在一般情况下,您必须同时检查这两个选项
相关问题 更多 >
编程相关推荐