用正则表达式匹配文本时忽略 HTML 标签

1条回答

网友

1楼 · 发布于 2024-04-25 03:56:06

好吧，我自己想出了一个解决办法：

import re 

# Sample HTML fragment used by the demo.  It deliberately starts in the middle
# of a tag ("font>") and spreads one sentence over several <font> elements.
test_html = (
    r'font></font><font face="Tahoma"><font size="4"> alleging that </font></font>'
    r'<font face="Tahoma"><font size="4">soldiers of the Uganda Peoples '
    r'<span class="scayt-misspell" data-scayt_word="Defence" data-scaytid="32">'
    r'Defence</span>'
)

# Matches a run of characters containing neither '<' nor '>' that is
# immediately preceded by '>' and immediately followed by '<', i.e. the text
# sitting between the end of one tag and the start of the next.
NOT_TAG_REGEX = re.compile(r'(?<=\>)[^<>]+(?=\<)')


def create_chunks(html: str = None):
    """Split an HTML string into the text chunks found between its tags.

    Every match of ``NOT_TAG_REGEX`` in *html* becomes one chunk.  Each chunk
    records its position twice: in the original HTML string, and in the
    plain-text string obtained by concatenating the ``"text"`` of all chunks
    in order.

    Args:
        html: The HTML source to scan.  ``None`` (the default) is treated as
            "no input" and yields an empty list.

    Returns:
        A list of dicts, in document order, each with the keys:

        * ``"text"``       -- the matched text
        * ``"html_start"`` -- start index of the text in *html*
        * ``"html_end"``   -- end index (exclusive) of the text in *html*
        * ``"txt_start"``  -- start index in the concatenated text
        * ``"txt_end"``    -- end index (exclusive) in the concatenated text
    """
    # The signature advertises None as the default, but the regex engine
    # rejects None with a TypeError; report "no chunks" instead.
    if html is None:
        return []

    chunks = []
    text_cursor = 0  # length of the concatenated text produced so far
    for match in NOT_TAG_REGEX.finditer(html):
        txt_start = text_cursor
        text_cursor += match.end() - match.start()
        chunks.append({
            "text": match.group(),
            "html_start": match.start(),
            "html_end": match.end(),
            "txt_start": txt_start,
            "txt_end": text_cursor,
        })
    return chunks


def to_html_indx(txt_indx, chunks):
    """Translate an index in the tag-free text into an index in the HTML.

    *chunks* is searched in order for the first entry whose
    ``txt_start``..``txt_end`` range contains *txt_indx*.  Both ends of the
    range are inclusive, so an index lying exactly on the boundary shared by
    two adjacent chunks resolves to the end of the earlier chunk.

    Args:
        txt_indx: Index into the text formed by joining every chunk's text.
        chunks: Chunk dicts carrying ``"txt_start"``, ``"txt_end"`` and
            ``"html_start"`` keys.

    Returns:
        The corresponding index in the original HTML string, or ``None``
        (after printing a message) when no chunk contains *txt_indx*.
    """
    candidates = (
        piece for piece in chunks
        if piece["txt_start"] <= txt_indx <= piece["txt_end"]
    )
    hit = next(candidates, None)
    if hit is None:
        print("some error message")
        return None

    # Keep the same offset inside the chunk, but measure it from the chunk's
    # position in the HTML instead of its position in the text.
    offset_in_chunk = txt_indx - hit["txt_start"]
    return offset_in_chunk + hit["html_start"]


def main():
    """Demo: find a phrase in the tag-free text and show its span in the HTML.

    Builds the chunks for ``test_html``, prints the concatenated text, then
    for every match of the example pattern prints the matched text followed
    by the slice of ``test_html`` that runs between the mapped start and end
    positions.
    """
    pieces = create_chunks(test_html)
    plain_text = "".join([piece["text"] for piece in pieces])
    print(plain_text)

    example_regex = re.compile(r'that soldiers of')
    for found in example_regex.finditer(plain_text):
        print("text match: " + found.group())

        # Map both ends of the plain-text span back into HTML coordinates.
        span_start, span_end = found.span()
        html_start = to_html_indx(span_start, pieces)
        html_end = to_html_indx(span_end, pieces)
        print("html match: " + test_html[html_start: html_end])

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

运行后输出如下：

text match: that soldiers of

html match: that </font></font><font face="Tahoma"><font size="4">soldiers of

相关问题更多 >

编程相关推荐

热门问题

热门文章

正则表达式忽略html标签

相关问题 更多 >

编程相关推荐

热门问题

热门文章

相关问题更多 >