在Python中使用序列匹配器查找最长公共字符串

2024-06-16 11:24:09 发布

您现在位置:Python中文网/ 问答频道 /正文

我试图在Python中使用difflib.SequenceMatcher来返回最大的公共字符串

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    
self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    
self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print match
print(string1[match.a: match.a + match.size])

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44,"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0,    len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size])

所以基本上,在比较 string1 和 string2(前两段代码)时,它返回 CRITICAL ha_test_util.py:44, 而当我从 string1 和 string2 中剪掉一些行之后(第6和7行),它返回 ERROR agave_util.py:64 Timed out waiting for

基本上我的问题是:为什么序列匹配器在我的第一个案例中没有返回正确的匹配?


Tags: in, py, test, host, for, main, util, line
1条回答
网友
1楼 · 发布于 2024-06-16 11:24:09

你遇到的是 SequenceMatcher 的自动垃圾(autojunk)启发式带来的影响(在你的例子中)。引自官方文档:

Automatic junk heuristic: SequenceMatcher supports a heuristic that automatically treats certain sequence items as junk. The heuristic counts how many times each individual item appears in the sequence. If an item’s duplicates (after the first one) account for more than 1% of the sequence and the sequence is at least 200 items long, this item is marked as “popular” and is treated as junk for the purpose of sequence matching. This heuristic can be turned off by setting the autojunk argument to False when creating the SequenceMatcher.

SequenceMatcher构造函数中,autojunk默认为True。如果您尝试使用autojunk=False,您将得到预期的最长匹配:

from difflib import SequenceMatcher

string1 = """ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    
self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""
string2 = """ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    
self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

match = SequenceMatcher(None, string1, string2, autojunk=False).find_longest_match(0, len(string1), 0, len(string2))
print(match)

输出:

Match(a=110, b=156, size=534)

当然,我们可以检查所有匹配块并找到最长的:

>>> max(SequenceMatcher(None, string1, string2, autojunk=False).get_matching_blocks(),
...     key=lambda m: m.size)
Match(a=110, b=156, size=534)

为了在一个简单的例子中说明autojunking的效果,让我们看看这里发生了什么:

>>> a = "aa:bb:cc" + ":"*200
>>> b = "aa:bb" + ":"*200
>>> SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
Match(a=0, b=0, size=6)     # : is classified as junk
>>> SequenceMatcher(None, a, b, autojunk=False).find_longest_match(0, len(a), 0, len(b))
Match(a=8, b=5, size=200)   # : is NOT classified as junk

在第一种情况下(默认 autojunk=True),字符 ":" 被认为是垃圾字符(它占序列的 1% 以上,且序列至少有 200 个元素),因此,"looks right to people"(在人看来合理)的最长匹配只有 6 个字符(前 6 个字符)。

在第二种情况下(使用显式的 autojunk=False),垃圾启发式是关闭的,因此最长的匹配是最后 200 个字符。

如果您对较短的序列(少于 200 个字符)重复相同的测试,您可以看到 autojunk 没有任何区别,因为此时垃圾启发式不会生效(请参见 difflib 的源码)。

相关问题 更多 >