当使用wikipedia的API的.random()函数时,为什么页面越多,重复页面的数量就越大?

2024-04-26 12:15:42 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在使用维基百科的API来搜集挪威文的内容,将其清理并写入一个文件中,以便为CMU Sphinx培训一个语言模型。你知道吗

在for循环中运行.random函数时,我遇到了一个问题。我正在通过pageId计算唯一页面的数量,我得到了大量的副本。一开始并不太多,但过了一段时间,重复的数量是唯一ID数量的两倍。当我们有40页时,我们有大约80个重复。你知道吗

随机函数肯定有什么我们没有看到的?你知道吗

这是密码。正则表达式是用于更容易读取过滤器顺序的函数。你知道吗

import re

import wikipedia

"""
Module wikitest.py - Script for scraping Wikipedia of text based on articles 
found by using wikipedia.random.
    Used for gathering and formatting written text representative of the 
Norwegian language,
    for use in training language models.
"""

# Create regex to filter the results
specialcharreg = re.compile(r'[^A-Za-zÆØÅæøå0-9.,-]+', re.IGNORECASE)
whitespacereg = re.compile(r' {2}', re.IGNORECASE)
punctuationreg = re.compile(r'[.]+', re.IGNORECASE)
shortsentencereg = re.compile(r'(</?s>)([a-zæøåA-ZÆØÅ0-9,\- ]{0,50})(</? 
s>)', re.IGNORECASE)
isbnreg = re.compile(r'(ISBN)([0-9- ]{7,21})', re.IGNORECASE)
nospaceaftertagreg = re.compile(r'(<s>([a-zæøåA-ZÆØÅ,-]))', re.IGNORECASE)


#  filter-methods for formatting the text
def nospeacialchar(wikicontent): return re.sub(specialcharreg, ' ', 
wikicontent)


def nodoublewhitespace(wikicontent): return re.sub(whitespacereg, ' ', 
wikicontent)


def faultysentence(wikicontent): return re.sub(shortsentencereg, '', 
wikicontent)


def inserttags(wikicontent): return re.sub(punctuationreg, ' </s>\n<s>', wikicontent)


def noemptylines(wikicontent): return "".join([s for s in wikicontent.splitlines(True) if s.strip("\r\n")])


def noisbn(wikicontent): return re.sub(isbnreg, '', wikicontent)


def nospaceaftertag(wikicontent): return re.sub(nospaceaftertagreg, '<s> ', wikicontent)


# We only want articles written in Norwegian
wikipedia.set_lang("no")

#  initialize different counters for counting duplicates and uniques
idlist = []
duplicatecount = 0
uniquecount: int = 0
showuniquecount = 0

# define number of pages to get
for x in range(0, 10001):
    try:
        randompages = wikipedia.random(1)
        for page in randompages:

            #  get wikipedia page
            wikipage = wikipedia.page(page)

            #  get page ID
            pageid = wikipage.pageid

            #  check for ID-duplicate
            if pageid not in idlist:

                #  add ID to list of gotten pages
                idlist.append(pageid)
                uniquecount += 1
                showuniquecount += 1

                #  on every tenth iteration, print current unique count
                if showuniquecount == 10:
                    print("Current unique page count:{0}".format(uniquecount))
                    showuniquecount = 0
                wikicontent = wikipage.content

                # filter the content using different regex-functions
                filteredcontent = \
                    faultysentence(
                        noemptylines(
                            nospaceaftertag(
                                faultysentence(
                                    inserttags(
                                        nodoublewhitespace(
                                            noisbn(
                                                nospeacialchar(
                                                    wikicontent))))))))
                print(filteredcontent)

                # Write operation to file
                with open("wikiscraping2.txt", "a", encoding="utf-8") as the_file:
                    the_file.write('<s> ' + filteredcontent)
                    the_file.close()
            else:
                duplicatecount += 1
                print("Duplicate! Current duplicate count:{0}".format(duplicatecount))

    #  catch exception of wikipedia not knowing which page is specified
    except wikipedia.DisambiguationError as e:
        print('DisambiguationError!')

        #  continue to next
        continue

    # catch exception
    except wikipedia.exceptions.PageError as d:
        print('Index error! (Page could not be found)')

        # continue to next
        continue

Tags: ofthetoinreidforreturn