python中的多线程爬虫

2024-04-25 14:37:41 发布

您现在位置:Python中文网/ 问答频道 /正文

我正在尝试实现一个 multithreaded(多线程)爬虫程序,它获取一个初始 url 并搜索该链接中的链接,显示每个链接,同时查找每个链接中的链接

这是我的代码

import urllib.request, re, threading, csv
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit

class a3_6:
    """Toy multithreaded Wikipedia crawler.

    Workers fetch pages into an HTML queue, mine anchor links out of the
    HTML, and print every newly discovered link.  All queues and the
    visited list live at class level, so they are shared by every
    instance and every worker thread.
    """

    # Pending URLs to fetch; bounded to 100 outstanding entries.
    __url_q = Queue(100)
    # Raw HTML bytes waiting to be parsed.
    __html_q = Queue()
    # Discovered links waiting to be printed.
    __data_q = Queue()
    # URLs already fetched.  NOTE(review): a plain list — membership
    # tests are O(n) and appends are not guarded by a lock.
    __visited_urls = []

    def __init__(self, start_url, max_threads):
        """Seed the URL queue with *start_url* and remember the worker count."""
        self.__url_q.put(start_url)
        self.max_threads = max_threads

    def gethtml(self, url):
        """Download *url* and push its HTML onto the HTML queue.

        Errors are printed and swallowed so one bad URL cannot kill a
        worker thread; the URL is recorded as visited either way.
        """
        try:
            req = urllib.request.Request(url)
            html = urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(e.reason)
        except Exception:  # was a bare except: keep best-effort, but narrower
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        """Parse all queued HTML and enqueue each not-yet-visited link."""
        while not self.__html_q.empty():
            soup = BeautifulSoup(self.__html_q.get(), "html.parser")
            for a in soup.find_all('a', href=True):
                link = 'https://en.wikipedia.org' + a.get('href')
                # BUG FIX: the original tested `a not in __visited_urls`,
                # comparing a bs4 Tag against URL strings — that check
                # could never match, so every link was re-enqueued.
                if link not in self.__visited_urls:
                    self.__url_q.put(link)
                    self.__data_q.put(link)

    def store(self):
        """Print collected links, returning once the data queue is drained.

        BUG FIX: the original `while True` never broke out when the queue
        was empty, which is why the program "never ended".
        """
        while not self.__data_q.empty():
            print(self.__data_q.get())

    def download_thread(self):
        """Fetch every URL currently waiting in the URL queue."""
        while not self.__url_q.empty():
            self.gethtml(self.__url_q.get())

    def run(self):
        """Pipeline executed by each worker: download, mine, then print."""
        self.download_thread()
        self.mine_thread()
        self.store()

    def op(self):
        """Start *max_threads* daemon workers, then drain results here too."""
        for _ in range(self.max_threads):
            t = threading.Thread(target=self.run)
            t.daemon = True
            t.start()
        self.store()


if __name__ == '__main__':
    # Crawl Wikipedia's main page using five worker threads.
    crawler = a3_6('https://en.wikipedia.org/wiki/Main_Page', 5)
    crawler.op()

编辑:我编辑了代码,现在得到了正确的结果,但仍然没有结束。


Tags: importselftrueurldataifqueueput
1条回答
网友
1楼 · 发布于 2024-04-25 14:37:41

我找到了解决办法。我接受了詹姆斯·哈里森的帮助。我不知道他为什么删除了原来的解决方案,但它在这里

import urllib.request, threading
from queue import Queue
from bs4 import BeautifulSoup
from sys import exit
from a3_3 import store_to_db

class a3_5:
    """Multithreaded crawler: daemon workers drain the URL queue, then
    links are mined out of the fetched HTML and printed.

    All queues and the visited list are class-level, shared by every
    instance and worker thread.
    """

    # Pending URLs; bounded to 100 outstanding entries.
    __url_q = Queue(100)
    # Raw HTML bytes awaiting parsing.
    __html_q = Queue()
    # Discovered links awaiting printing.
    __data_q = Queue()
    # URLs already fetched, so links are enqueued at most once.
    __visited_urls = []

    def gethtml(self, url):
        """Download *url*, queue its HTML, and record the URL as visited.

        Errors are printed and swallowed so one bad URL cannot kill the
        worker thread.

        BUG FIXES vs. the posted version:
        * the dead `pars = urlparse(url)` line called a name that was
          never imported, raising NameError after every successful
          fetch, which the bare except reported as "invalid: <url>";
        * `e.reason + ':' + url` raised TypeError when `reason` was not
          a str (it is often a socket error object);
        * the URL was never appended to __visited_urls, so the
          de-duplication check in mine_thread was dead code.
        """
        try:
            req = urllib.request.Request(url)
            html = urllib.request.urlopen(req).read()
            self.__html_q.put(html)
        except urllib.error.URLError as e:
            print(str(e.reason) + ':' + url)
        except Exception:
            print("invalid: " + url)
        self.__visited_urls.append(url)

    def mine_thread(self):
        """Parse all queued HTML and enqueue each not-yet-seen link."""
        while not self.__html_q.empty():
            soup = BeautifulSoup(self.__html_q.get(), "html.parser")
            for a in soup.find_all('a', href=True):
                link = a.get('href')
                if link not in self.__visited_urls:
                    self.__url_q.put(link)
                    self.__data_q.put(link)

    def store(self):
        """Print every collected link, returning once the queue is empty."""
        while not self.__data_q.empty():
            print(self.__data_q.get())

    def download_thread(self):
        """Worker loop: fetch URLs forever (runs in a daemon thread).

        Uses a blocking get() instead of the original empty()/get()
        busy-spin; task_done() still pairs with every get() so that
        __url_q.join() in op() can unblock.
        """
        while True:
            self.gethtml(self.__url_q.get())
            self.__url_q.task_done()

    def op(self, *urls):
        """Crawl *urls*: start 25 daemon downloaders, wait for the URL
        queue to drain, then mine the fetched pages and print the links."""
        for _ in range(25):
            d = threading.Thread(target=self.download_thread)
            d.daemon = True  # setDaemon() is deprecated since Python 3.10
            d.start()
        for url in urls:
            self.__url_q.put(url)
        self.__url_q.join()
        self.mine_thread()
        self.store()

if __name__ == '__main__':
    # Seed URL(s); the extra ones were left disabled by the original author:
    # 'https://en.wikipedia.org/wiki/Malharrao_Holkar',
    # 'https://en.wikipedia.org/wiki/Ranoji_Scindia'
    seed_urls = ['https://en.wikipedia.org/wiki/Bajirao']
    crawler = a3_5()
    crawler.op(*seed_urls)

基本上,我必须安排另一个队列,在那里我必须设置工人(worker)来激活线程。另外,mine_thread 和 store 方法需要在 download_thread 方法完成后启动,因为否则这些值不会被存储。你知道吗

相关问题 更多 >