Scraper无法打印所有结果

# Scrape listing titles and contact phone numbers from a fixed set of
# Craigslist pages and print them.
import re ; import requests ; from lxml import html

# Prefix that get_link() prepends to the href of a listing's contact link.
base = "http://bangalore.craigslist.co.in"

# Listing pages to scrape.
url_list = [
    'http://bangalore.craigslist.co.in/reb/d/flat-is-for-sale-at-cooke-town/6266183606.html',
    'http://bangalore.craigslist.co.in/reb/d/prestige-sunnyside/6259128505.html',
    'http://bangalore.craigslist.co.in/reb/d/jayanagar-2nd-block-4000-sft/6221720477.html',
    'http://bangalore.craigslist.co.in/reb/d/prestige-ozone-type-3-r-villa/6259928614.html',
    'http://bangalore.craigslist.co.in/reb/d/zed-homes-3-bedroom-flat-for/6257075793.html'
]


def get_link(medium_link):
    """Fetch one listing page and pass its title and contact link to parse_doc().

    Either value falls back to "" when its CSS selector matches nothing.
    """
    response = requests.get(medium_link).text
    tree = html.fromstring(response)
    try:
        name = tree.cssselect('span#titletextonly')[0].text
    except IndexError:
        name = ""
    try:
        link = base + tree.cssselect('a.showcontact')[0].attrib['href']
    except IndexError:
        # No contact link on this listing.
        link = ""
    parse_doc(name, link)


def parse_doc(title, ano_page_link):
    """Print the title and the first 10-digit number found on the contact page.

    NOTE: nothing at all is printed when ano_page_link is empty, so listings
    without a contact link never show up in the output.
    """
    if ano_page_link:
        page = requests.get(ano_page_link).text
        # First run of ten digits on the page, or "" when there is none.
        tel = re.findall(r'\d{10}', page)[0] if re.findall(r'\d{10}', page) else ""
        print(title, tel)


if __name__ == '__main__':
    for link in url_list:
        get_link(link)

A Flat is for sale at Cooke Town Prestige Sunnyside Jayanagar 2nd Block, 4000 sft Plot for Sale 9845012673 PRESTIGE OZONE TYPE D 3 B/R VILLA FOR SALE 9611226364 T ZED HOMES 3 BEDROOM FLAT FOR SALE 9611226364

2条回答

网友

1楼 · 编辑于 2024-04-25 21:15:08

注意，例如，在http://bangalore.craigslist.co.in/reb/d/flat-is-for-sale-at-cooke-town/6266183606.html上没有与'a.showcontact'选择器匹配的链接，因此下面的块

try:
    # [0] raises IndexError when no <a class="showcontact"> element matches.
    link = base + tree.cssselect('a.showcontact')[0].attrib['href']
except IndexError:
    # No contact link on this listing: fall back to an empty string.
    link = ""

会把 link 设为空字符串（link = ""）。

随后执行到 if ano_page_link: 时，由于空字符串 "" 的布尔值为 False，if 块中的所有语句都会被跳过，因此这条记录不会输出任何内容。

您可以尝试以下方法：

def parse_doc(title, ano_page_link, timeout=10):
    """Print a listing title, followed by its phone number when one is found.

    Listings without a contact link are still printed, with the title only.

    Args:
        title: Listing title to print.
        ano_page_link: URL of the contact page, or a falsy value ("" / None)
            when the listing has no contact link.
        timeout: Seconds to wait for the contact page before requests raises
            a timeout error instead of blocking indefinitely.
    """
    if not ano_page_link:
        print(title)
        return

    page = requests.get(ano_page_link, timeout=timeout).text
    # Scan the page once; the first run of ten digits is taken as the number.
    match = re.search(r'\d{10}', page)
    tel = match.group() if match else ""
    print(title, tel)

网友

2楼 · 编辑于 2024-04-25 21:15:08

通过将收集数据和打印数据这两个任务分开，您可以获得更大的灵活性。以后想要扩展时，添加更多信息也会更容易。

def collect_info(medium_link):
    """Download one listing page and return its ``(title, tel)`` pair.

    ``tel`` is '' when the listing has no contact link; in that case the
    contact page is never requested.
    """
    page_source = requests.get(medium_link).text
    document = html.fromstring(page_source)

    heading = get_title(document)
    contact_url = get_contact_link(document)
    if not contact_url:
        return heading, ''
    return heading, get_tel(contact_url)


def get_title(tree):
    """Return the listing title, or "" when it cannot be determined.

    Args:
        tree: Parsed page exposing ``cssselect()``.

    Returns:
        The ``text`` of the first ``span#titletextonly`` element. "" is
        returned both when no such element exists and when the element's
        ``text`` is None, so callers always receive a string.
    """
    matches = tree.cssselect('span#titletextonly')
    if not matches:
        return ""
    # .text can be None (element without leading text); normalise to "".
    return matches[0].text or ""

def get_contact_link(tree):
    """Return the URL of the listing's contact page, or "" if it has none.

    Args:
        tree: Parsed page exposing ``cssselect()``.

    Returns:
        The ``href`` of the first ``a.showcontact`` element resolved against
        the module-level ``base``, or "" when no such element exists or it
        carries no ``href`` attribute.
    """
    # Local import so this snippet does not depend on the file's import block.
    from urllib.parse import urljoin

    try:
        href = tree.cssselect('a.showcontact')[0].attrib['href']
    except (IndexError, KeyError):
        # No matching anchor, or an anchor without an href attribute.
        return ""
    # urljoin resolves a relative href against base and leaves an absolute
    # href untouched, where plain concatenation would produce a broken URL.
    return urljoin(base, href)

def get_tel(ano_page_link, timeout=10):
    """Fetch the contact page and return the first 10-digit number on it.

    Args:
        ano_page_link: URL of the contact page.
        timeout: Seconds to wait for the page before requests raises a
            timeout error instead of blocking indefinitely.

    Returns:
        The first run of ten consecutive digits in the page text, or "" when
        the page contains none.
    """
    page = requests.get(ano_page_link, timeout=timeout).text
    # One pass over the page instead of evaluating the same regex twice.
    match = re.search(r'\d{10}', page)
    return match.group() if match else ""

def print_info(title, tel):
    """Print one listing; the phone part is left out when ``tel`` is empty."""
    template = 'Title: {title}, Phone: {tel}' if tel else 'Title: {title}'
    print(template.format(title=title, tel=tel))

if __name__ == '__main__':
    # Handle the listings one at a time: collect a listing, then print it
    # before moving on to the next URL.
    for listing_url in url_list:
        print_info(*collect_info(listing_url))

相关问题更多 >

编程相关推荐

热门问题

热门文章