Beautiful Soup parsing no longer works


Last year I wrote a web scraper that starts from a random Wikipedia page, follows the first link in the first paragraph, then scrapes the next page, and repeats the process until it reaches the Philosophy page. The script was written as a solution to this problem.

In my script, the code starts from a fixed number of random pages and, for each page, navigates to the Philosophy page, recording the length of the path from start to finish (from the random page to the Philosophy page). These lengths are then plotted with matplotlib.

When I tested the script early last year, every step of it worked, but when I ran it for the first time today the Beautiful Soup parsing code stopped extracting any links from the article body for some reason; it does not even find any paragraphs to parse. I have not changed any of the code since the last run, and the HTML markup of the Wikipedia page DOM looks the same as before, so has the Beautiful Soup library changed, or has something in the Wikipedia DOM changed subtly recently?

The code for the script is below:

import sys
import json
from urlparse import urljoin

import requests
from lxml.html import fromstring
from bs4 import BeautifulSoup,NavigableString, Tag
import matplotlib.pyplot as plt
import scipy
import scipy.stats

reload(sys)
sys.setdefaultencoding('utf-8')


class Crawler:
    """ Class used to crawl wikipedia pages starting from a random article."""
    def __init__(self):
        self.base_url = "https://en.wikipedia.org"
        self.NUM_PAGES_TO_CRAWL = 2
    def get_valid_link(self, curr_response):
        """Takes an html response and returns the first link in the main body of the article."""
        curr_root = BeautifulSoup(curr_response.text,"lxml")
        first = curr_root.select_one("#mw-content-text") # locate main body
        if not first:
            return None
        par = first.find_all("p",recursive = False,limit = 10)
        print par
        heading = curr_root.select_one("#firstHeading").text
        heading = reformat_string('(',heading)
        first_paragraph_found = False
        head_tokens = tokenize(heading)

        # Find which paragraph has the first link
        i = 0
        for i in range(len(par)):
            if par[i].b is not None:
                bold = ""
                for string in par[i].find_all("b"):
                    bold += " " + string.text
                bold = reformat_string('(', bold)
                bold_tokens = tokenize(bold)
                heading_match = check_name_match(head_tokens,bold_tokens)
                if heading_match:
                    first_paragraph_found = True
                if heading_match and par[i].a:
                    break
            if par[i].a is not None:
                anchor = par[i].a.text
                if anchor:
                    anchor = reformat_string('(', anchor)
                    a_tokens = tokenize(anchor)
                    heading_match = check_name_match(head_tokens,a_tokens)
                    if heading_match:
                        break
            if first_paragraph_found and par[i].a:
                break   
            i += 1

        # if none of the paragraphs have a link and article contains only a list
        if i >= len(par)-1 and first_paragraph_found:
            u_list = first.find_all('ul')
            try:
                return u_list[0].li.a['href']
            except (IndexError, AttributeError,TypeError):
                return None
        elif i >= len(par)-1:# Reached article with no main body
            return None

        main_body_idx = i
        stack = []
        # Find the first link before or after parentheses 
        for child in par[main_body_idx].children:
            if isinstance(child,NavigableString):
                if "(" in child:
                    stack.append("(")
                if ")" in child:
                    try:
                        stack.pop()
                    except IndexError: # html malformed
                        return None

            if isinstance(child, Tag) and child.name == "a" and not stack:
                link = child['href']        
                link = reformat_string('#',link)
                try:
                    return str(link)
                except KeyError: # Reached article with no main body
                    return None

    def crawl_to_philosophy(self, start_url,session):
        """Follow the path of each url until the philosophy page is reached and return the path."""
        link_path = []
        # Get first link
        try:
            init_response = session.get(start_url)
        except requests.exceptions.RequestException as e: # bad link
            return None

        init_link = self.get_valid_link(init_response)
        if not init_link:
            return None
        link_path.append(urljoin(self.base_url, init_link))

        # Follow path of links until the philosophy page is reached
        i = 0
        while True:
            if "philosophy" in  link_path[i].lower():
                break
            try:
                curr_response = session.get(link_path[i])
            except requests.exceptions.RequestException as e: # bad link
                return None 

            curr_link = self.get_valid_link(curr_response)
            if not curr_link or "redlink" in curr_link:
                return None
            new_link = urljoin(self.base_url, curr_link)
            for i in range(len(link_path)):
                if new_link in link_path[i] : # loop found
                    return None
            link_path.append(new_link)
            i += 1
        return link_path

    def find_paths_to_philosophy(self,url):
        """Find paths starting from 500 links."""
        i = 0
        crawl_list = []
        with requests.Session() as s:
            while i < self.NUM_PAGES_TO_CRAWL:
                path = self.crawl_to_philosophy(url,s)
                if path is not None:
                    crawl_list.append(len(path))
                    i += 1
            plot_lengths(crawl_list)


def plot_lengths(lens):
    """Plot the distribution of path lengths."""
    freq = {}
    max_len = 0

    for length in lens:
        max_len = max(length,max_len)
        if length in freq:
            freq[length] += 1
        else:
            freq[length] = 1
    max_freq = max(freq.values())
    bins = range(0, max_len + 1, 2)
    plt.hist(lens,bins,histtype = 'bar',rwidth = 0.8)
    plt.xlabel('x')
    plt.ylabel('Path Lengths')
    plt.title('Distribution of path lengths')
    dist_names = ['gamma', 'beta', 'rayleigh', 'norm', 'pareto']

    for dist_name in dist_names:
        dist = getattr(scipy.stats, dist_name)
        param = dist.fit(lens)
        pdf_fitted = dist.pdf(bins, *param[:-2], loc=param[-2], scale=param[-1]) * len(lens)
        plt.plot(bins,pdf_fitted, label=dist_name)
        plt.xlim(0,max_len)
        plt.ylim(0,max_freq)
    plt.legend(loc='upper right')
    plt.show()


# Utility functions used by Crawler class

def reformat_string(char, word):
    """Remove passed in char from a string and convert its characters to lowercase."""
    word = word.lower()
    char_idx = word.find(char)
    if char_idx != -1:
        return word[:char_idx]
    return word

def check_name_match(heading, string):
    """Determine whether or not any part of the article heading is in the string and vice versa."""
    for i in range(len(string)):
        for j in range(len(heading)):
            if heading[j] in string[i] or string[i] in heading[j]:
                return True
    return False

def tokenize(word):
    """Split the passed in 'word' on space characters and return a list of tokens."""
    tokens = []
    curr_word = ""
    for i in range(len(word)):
        if word[i] == " " and i == len(word)-1:
            tokens.append(word.strip(" "))
            return tokens
        curr_word += word[i]
        if word[i] == " " :
            tokens.append(curr_word)    
            curr_word = ""
            i+=1
        if i == len(word)-1:
            tokens.append(curr_word)    
            return tokens


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Special:Random"
    crawler = Crawler()
    crawler.find_paths_to_philosophy(url)

The lines in the script that no longer work are:

curr_root = BeautifulSoup(curr_response.text,"lxml")
first = curr_root.select_one("#mw-content-text") # locate main body
if not first:
    return None
par = first.find_all("p",recursive = False,limit = 10)
heading = curr_root.select_one("#firstHeading").text
heading = reformat_string('(',heading)
first_paragraph_found = False
head_tokens = tokenize(heading)
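
For reference, this is the minimal check I would run to see whether the paragraphs are still direct children of #mw-content-text, which is what the script assumes via recursive=False; the idea that a wrapper element may now sit between #mw-content-text and the <p> tags is only a guess on my part:

import requests
from bs4 import BeautifulSoup

def inspect_structure(url):
    """Print what sits directly under #mw-content-text, to show whether the
    <p> tags are still immediate children (the script relies on that via
    recursive=False) or have moved inside some wrapper element."""
    response = requests.get(url)
    root = BeautifulSoup(response.text, "lxml")
    body = root.select_one("#mw-content-text")
    if body is None:
        print("no #mw-content-text found")
        return
    child_names = [child.name for child in body.find_all(recursive=False)]
    print("direct children of #mw-content-text: %s" % child_names)
    print("direct <p> children: %d" % len(body.find_all("p", recursive=False)))
    print("<p> descendants anywhere in the body: %d" % len(body.find_all("p")))

inspect_structure("https://en.wikipedia.org/wiki/Philosophy")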

Edit:

The code does not work for any of the random links it picks, but here is a specific link for which it fails: en.wikipedia.org/wiki/Modern_Greek
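
And for completeness, the same kind of check against that specific page; the .mw-parser-output selector below is only a guess at where the paragraphs might have moved, not something I have confirmed:

import requests
from bs4 import BeautifulSoup

response = requests.get("https://en.wikipedia.org/wiki/Modern_Greek")
root = BeautifulSoup(response.text, "lxml")
body = root.select_one("#mw-content-text")

# Exactly what the script does today: only direct <p> children of #mw-content-text.
print("direct <p> children: %d" % len(body.find_all("p", recursive=False, limit=10)))

# Guess (unverified): the paragraphs may now live inside a wrapper div such as
# div.mw-parser-output; if so, descending one level would find them again.
wrapper = body.select_one(".mw-parser-output")
if wrapper is not None:
    print("<p> under .mw-parser-output: %d" % len(wrapper.find_all("p", recursive=False, limit=10)))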

