使用Python和BeautifulSoup对多个动态页面进行网页抓取

2024-05-15 20:47:46 发布

您现在位置:Python中文网/ 问答频道 /正文

我想抓取一组预先定义好的链接。我想从 https://www.outdooractive.com/de/ 抓取某个特定区域内的徒步路线,所以我用20个链接定义了这个区域。到目前为止一切顺利:我能拿到单个链接的数据,但是当我尝试循环遍历页面列表时,它只处理了一个链接。我希望这只是我在逻辑上的疏忽。如果有人能帮我,我会非常感激。

这是我的代码。这里只列出了三个链接,而不是全部。

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


# Outdooractive tour-search URLs to scrape (only three of the full set are listed here).
webliste = [
    'https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.54301,48.94731',
    "https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.5443,48.88763",
    "https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.4589,48.93163",
]


# NOTE(review): this loop never loads anything. The loop variable is named
# `Page`, and the class statement in the body immediately rebinds `Page` to the
# class, so the URL taken from `webliste` is discarded. Each iteration only
# re-defines the class; no instance is created inside the loop.
for Page in webliste:

        class Page(QWebEnginePage):
            """Load a URL with Qt WebEngine and keep the page's HTML in ``self.html``."""

            def __init__(self, url):
                # Creates a QApplication, starts loading `url`, then runs the Qt
                # event loop; exec_() returns once Callable() calls app.quit().
                self.app = QApplication(sys.argv)
                QWebEnginePage.__init__(self)
                self.html = ''
                self.loadFinished.connect(self._on_load_finished)
                self.load(QUrl(url))
                self.app.exec_()

            def _on_load_finished(self):
                # Connected to loadFinished. toHtml() is given Callable as its
                # callback; whatever this call returns is stored in self.html
                # and then overwritten when Callable runs.
                self.html = self.toHtml(self.Callable)
                print('Load finished')

            def Callable(self, html_str):
                # Receives the page HTML, stores it and stops the event loop so
                # that __init__ can return.
                self.html = html_str
                self.app.quit()



# Runs exactly once, after the loop has finished. 0+1 == 1, so only the second
# URL in `webliste` is ever loaded; main() below parses this single page.
page = Page(webliste[0+1])   

# Open the semicolon-separated output file (UTF-8) and write its header row.
filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w", "utf-8")

headers = ";".join([
    "Tour Name",
    "Länge",
    "Zeit",
    "Aufstieg",
    "Abstieg",
    "Link zur Tour",
    "Anbieter",
]) + "\n"
f.write(headers)



def main():
    """Parse the already-loaded global ``page`` and write one CSV row per tour.

    Every ``div.oax_dp_snippet`` in ``page.html`` is treated as one tour. Its
    name, length, duration, ascent, descent, link and provider are printed and
    written to the module-level file ``f``, which is closed before returning,
    even when parsing fails part-way through.
    """
    import csv  # local import: only this function writes CSV rows

    # csv.writer quotes fields that contain ';', quotes or line breaks, which
    # plain string concatenation would write as a corrupted row.
    writer = csv.writer(f, delimiter=";", lineterminator="\n")

    try:
        soup = bs.BeautifulSoup(page.html, 'html.parser')
        containers = soup.findAll("div", {"class": "oax_dp_snippet"})
        print("Anzahl der gefundenen touren", len(containers))

        for container in containers:
            tour_name = container.findAll("span", {"dir": "auto"})[0].text
            print("Name der Tour: ", tour_name)

            # The first four data cells are read as length, time, ascent, descent.
            tour_data = container.findAll("div", {"class": "oax_tour_data oax_fl"})
            leange = tour_data[0].text.strip()
            zeit = tour_data[1].text.strip()
            aufstieg = tour_data[2].text.strip()
            abstieg = tour_data[3].text.strip()

            # Each value gets its own label (all four used to say "Länge").
            print("Länge der Tour: ", leange)
            print("Dauer der Tour: ", zeit)
            print("Aufstieg der Tour: ", aufstieg)
            print("Abstieg der Tour: ", abstieg)

            link = container.a["href"]
            print("Link zur Tour: ", link)

            tour_anbieter = container.findAll(
                "div",
                {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"},
            )
            anbieter = tour_anbieter[0].text.strip()
            print("Tourenanbieter: ", anbieter)

            writer.writerow([tour_name, leange, zeit, aufstieg, abstieg, link, anbieter])
    finally:
        # Always release the output file, even if a snippet could not be parsed.
        f.close()


if __name__ == '__main__':
    main()

Tags: textimportselfdatacontainerhtmllinkstrip
1条回答
网友
1楼 · 发布于 2024-05-15 20:47:46

@Steve Haigh 谢谢你,你的第二条建议是最好的。现在一切都能正常运行了。我知道这段代码不算优雅,但它确实管用 ;)

import bs4 as bs
import sys
import urllib.request
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
import codecs


class Page(QWebEnginePage):
    """Load a URL with Qt WebEngine and capture the page's HTML.

    Constructing an instance blocks: it starts the load and runs the Qt event
    loop until ``Callable`` has received the HTML and called ``app.quit()``.
    Afterwards the markup is available as ``self.html``.
    """

    def __init__(self, url):
        """Load *url* and return once its HTML has been stored in ``self.html``."""
        # Qt allows only one QApplication per process. Pages are created in a
        # loop, so reuse the existing application instead of making another.
        app = QApplication.instance()
        if app is None:
            app = QApplication(sys.argv)
        self.app = app
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once loading has finished."""
        # toHtml() delivers its result to the callback and returns nothing, so
        # its return value must not be assigned to self.html.
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the delivered HTML and stop the event loop so __init__ returns."""
        self.html = html_str
        self.app.quit()

# Outdooractive tour-search URLs that main() scrapes.
webliste = [
    'https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.32663,49.07201',
    "https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.30002,49.0945",
    "https://www.outdooractive.com/de/touren/#cat=Wanderung&view=listMap&wt=Nationalpark%20Bayerischer%20Wald%20(94258%20Frauenau%20Germany)&zc=14,13.22097,49.11664",
]




# Output file: UTF-8, semicolon-separated; the header row is written right away.
filename = "WandertourenLinks.csv"
f = codecs.open(filename, "w", "utf-8")

headers = ";".join(
    ("Tour Name", "Länge", "Zeit", "Aufstieg", "Abstieg", "Link zur Tour", "Anbieter")
) + "\n"
f.write(headers)

def main():
    """Scrape every URL in ``webliste`` and write one CSV row per tour found.

    Each URL is loaded through ``Page``. Every ``div.oax_dp_snippet`` in the
    resulting HTML is treated as one tour, whose name, length, duration,
    ascent, descent, link and provider are printed and written to the
    module-level file ``f``. The file is closed before returning, even when
    loading or parsing fails part-way through.
    """
    import csv  # local import: only this function writes CSV rows

    # csv.writer quotes fields that contain ';', quotes or line breaks, which
    # plain string concatenation would write as a corrupted row.
    writer = csv.writer(f, delimiter=";", lineterminator="\n")

    try:
        # Iterate the list itself so adding URLs needs no change here
        # (this used to be a hard-coded range(3)).
        for url in webliste:
            page = Page(url)
            soup = bs.BeautifulSoup(page.html, 'html.parser')
            containers = soup.findAll("div", {"class": "oax_dp_snippet"})
            print("Anzahl der gefundenen touren", len(containers))

            for container in containers:
                tour_name = container.findAll("span", {"dir": "auto"})[0].text
                print("Name der Tour: ", tour_name)

                # The first four data cells are read as length, time, ascent, descent.
                tour_data = container.findAll("div", {"class": "oax_tour_data oax_fl"})
                leange = tour_data[0].text.strip()
                zeit = tour_data[1].text.strip()
                aufstieg = tour_data[2].text.strip()
                abstieg = tour_data[3].text.strip()

                # Each value gets its own label (all four used to say "Länge").
                print("Länge der Tour: ", leange)
                print("Dauer der Tour: ", zeit)
                print("Aufstieg der Tour: ", aufstieg)
                print("Abstieg der Tour: ", abstieg)

                link = container.a["href"]
                print("Link zur Tour: ", link)

                tour_anbieter = container.findAll(
                    "div",
                    {"class": "oax_var_pos oax_var_pos_bottom oax_font_smaller oax_line_height_14 oax_ellipsis"},
                )
                anbieter = tour_anbieter[0].text.strip()
                print("Tourenanbieter: ", anbieter)

                writer.writerow([tour_name, leange, zeit, aufstieg, abstieg, link, anbieter])
    finally:
        # Always release the output file, even if a page or snippet failed.
        f.close()


if __name__ == '__main__':
    main()

相关问题 更多 >