如何在 Python 代码中以 list/content 成对的形式输出?

2024-04-25 13:26:13 发布

您现在位置:Python中文网/ 问答频道 /正文

我一直在为 this website 开发 Python 网络爬虫。我写了两个函数,它们各自都能正常工作。

一个用于收集列表(list)中的条目,

另一个用于收集每个列表条目对应的内容(content)数据。

我想以如下成对的形式输出:

"list#1/content#1"

"list#2/content#2"

"list#3/content#3"

为了实现这一点,我的代码中需要修改什么?

谢谢。

from bs4 import BeautifulSoup
import urllib.request

CAR_PAGE_TEMPLATE = "http://www.bobaedream.co.kr/cyber/CyberCar.php?gubun=I&page="
BASE_PAGE = 'http://www.bobaedream.co.kr'

def fetch_post_list():
    """Scrape the listing page(s) and return the link of the last priced row.

    Downloads every page in ``range(20, 21)`` (currently only page 20) built
    from ``CAR_PAGE_TEMPLATE``, locates the ``<table class="cyber">`` element
    and walks its ``<tr itemtype="http://schema.org/Article">`` rows. Rows
    whose fourth cell has no non-empty ``<em>`` price text are skipped.

    Returns:
        The ``href`` of the anchor in the second cell of the last row that
        carried a price, or ``None`` when no such row was found.

    Raises:
        urllib.error.URLError: If a listing page cannot be fetched.
        AttributeError: If the page does not contain the expected table.
    """
    # Pre-bind the result so an empty page yields None instead of an
    # UnboundLocalError at the return statement.
    lst_link = None

    for i in range(20, 21):
        URL = CAR_PAGE_TEMPLATE + str(i)
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(URL) as res:
            html = res.read()
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', class_='cyber')

        # Per the original author's note, each page holds about 50 rows.
        lists = table.find_all('tr', itemtype="http://schema.org/Article")

        count = 0
        for lst in lists:
            # Parse the cells once per row instead of once per field.
            cells = lst.find_all('td')

            price_tag = cells[3].find('em')
            if price_tag is None or not price_tag.text:
                # No price shown for this row: skip it.
                continue

            lst_price = price_tag.text
            anchor = cells[1].find('a')
            lst_title = anchor.text
            lst_link = anchor['href']
            img = cells[0].find('img')
            lst_photo_url = img['src'] if img is not None else ''
            count += 1

            # Uncomment to inspect every parsed row:
            # print('#', count, lst_title, lst_photo_url, lst_link, lst_price)

    return lst_link

def fetch_post_content(lst_link):
    """Fetch one post page and return the ``<dd>`` tags of its first section.

    The page URL is ``BASE_PAGE + lst_link``. Inside the
    ``<div class="rightarea">`` element, the first nested ``<div>`` is treated
    as the car information section and the second as the seller information
    section. Each ``<dd>`` in both sections is expected to contain a
    ``<span class="t">`` and a ``<span class="s">``.

    Args:
        lst_link: Site-relative link of the post, appended to ``BASE_PAGE``.

    Returns:
        The list of ``<dd>`` tags found in the first (car information)
        section.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        AttributeError: If the page has no ``rightarea`` div.
        IndexError: If a section or one of the expected spans is missing.
    """
    URL = BASE_PAGE + lst_link
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(URL) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    # Basic information container.
    table = soup.find('div', class_='rightarea')
    # Search the nested divs once and index into the result for both sections.
    sections = table.find_all('div')

    # Car section; the original note lists: Number, Year, Mileage, Gas Type,
    # Color, Accident.
    content_table1 = sections[0]
    dds = content_table1.find_all('dd')
    for dd in dds:
        car_span_t = dd.find_all('span', {'class': 't'})[0]
        car_span_s = dd.find_all('span', {'class': 's'})[0]
        # print(car_span_t.text, ':', car_span_s.text)

    # Seller information section.
    content_table2 = sections[1]
    dds2 = content_table2.find_all('dd')
    for dd2 in dds2:
        # Read from dd2 (the seller entry); the earlier code reused the stale
        # ``dd`` left over from the car loop.
        seller_span_t = dd2.find_all('span', {'class': 't'})[0]
        seller_span_s = dd2.find_all('span', {'class': 's'})[0]
        # print(seller_span_t.text, ':', seller_span_s.text)

    return dds

Tags: text, for, html, page, table, link, content, all