从网页中刮取时的索引器

2024-06-16 11:40:00 发布

您现在位置:Python中文网/ 问答频道 /正文

我一直在尝试使用此代码从xhamster频道中获取数据,以便进行研究

import json
from multiprocessing.dummy import Pool as ThreadPool

from lxml import html

from util import req


def get_channel_urls(url):
    r = req(url)
    tree = html.fromstring(r.text)
    print("Done", url)
    return [x.attrib['href'] for x in tree.xpath('//div[@class="item"]/a')]

def write_channel_data(url):
    r = req(url)
    html_text = r.text
    tree = html.fromstring(html_text)
    json_data = json.loads(
        tree.xpath('//script[@id="initials-script"]/text()')[0].strip().split("window.initials =")[1][:-1].strip())
    with open("channel_html/{}".format(json_data['sponsorChannel']['inurl']), 'w', encoding='utf-8') as outfile:
        outfile.write(html_text)
    print("Written data for:", url)


def main():
    letters = '0abcdefghijklmnopqrstuvqxyz'
    index_urls = ['https://xhamster.com/channels/all/{}'.format(index_letter) for index_letter in letters]
    index_urls.extend(['https://xhamster.com/gay/channels/all/{}'.format(index_letter) for index_letter in letters])
    index_urls.extend(['https://xhamster.com/shemale/channels/all/{}'.format(index_letter) for index_letter in letters])
    channel_urls = []
    for url in index_urls:
        channel_urls.extend(get_channel_urls(url))

    with open('channel_urls', 'w') as channel_url_backup_file:
        channel_url_backup_file.write("\n".join(channel_urls))

    # with open('channel_urls') as i:  # THIS IS TO READ A PRE-DOWNLOADED URL FILE
    #     channel_urls = [url.strip() for url in i.read().split()]

    with ThreadPool(processes=10) as pool:
        pool.map(write_channel_data, channel_urls)


if __name__ == '__main__':
    main()

它确实工作了一段时间,但后来我得到了这个错误。这个错误显然在main()函数中,但我不知道如何解决它 IndexError: list out of index


Tags: textinimportjsontreeurlfordata