I am building a web crawler using the code from "Introduction to Computing Using Python". What I want to do is avoid certain websites, such as Google or Yahoo, because of their size and their potential to send my crawler off to Andromeda.
So I created a self.prohibited list intended to filter out certain web pages. However, it doesn't work. Do you have any suggestions for a fix? Thanks in advance.
from urllib.request import urlopen
from csv import writer
# Collector and frequency come from the textbook's supporting module

def analyze(url):
    '''returns the list of http links
    in absolute format in the web page with URL url'''
    print('Visiting:', url)  # for testing
    # obtain links in the web page
    content = urlopen(url).read().decode()
    collector = Collector(url)
    collector.feed(content)
    urls = collector.getLink()
    # compute word frequencies
    content = collector.getData()
    freq = frequency(content)
    out = open('test.csv', 'a')
    csv = writer(out)
    # write the header row; the original print(out, 'URL', 'word', 'count')
    # printed the file object to the screen instead of writing to the file
    csv.writerow(('URL', 'word', 'count'))
    # print the frequency of every text data word in the web page
    print('\n{:50}{:10}{:5}'.format('URL', 'word', 'count'))
    for word in freq:
        row1 = (url, word, freq[word])
        print('\n{:50} {:10} {:5}'.format(url, word, freq[word]))
        csv.writerow(row1)
    print('\n{:50} {:10}'.format('URL', 'link'))
    for link in urls:
        print('\n{:50} {:10}'.format(url, link))
        row2 = (url, link)
        csv.writerow(row2)
    out.close()
    return urls
class Crawler:
    'a web crawler'

    def __init__(self):
        self.visited = set()
        self.prohibited = ['*google.com/*', '*yahoo.com/*']

    def crawl(self, url):
        '''calls analyze() on web page url
        and calls itself on every link to an unvisited web page'''
        links = analyze(url)
        self.visited.add(url)
        for link in links:
            if link not in self.visited and self.prohibited:
                try:
                    self.crawl(link)
                except:
                    pass
The condition link not in self.visited and self.prohibited is essentially equivalent to link not in self.visited, because self.prohibited always evaluates as true in that expression (it is a non-empty list). I think you should replace self.prohibited with: not any(re.match(x, link) for x in self.prohibited). For each prohibited regexp, this code checks whether the link matches it.
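One caveat with the re.match suggestion: the patterns as written ('*google.com/*') are shell-style wildcards, not valid regular expressions, and re.match would raise re.error on a leading '*'. A minimal sketch of the filter, assuming the patterns stay in wildcard form and using fnmatch instead of re (is_prohibited is a hypothetical helper name, not part of the textbook code):

```python
# Sketch: wildcard-based URL filtering with fnmatch.
# Assumes patterns like '*google.com/*' as in the question;
# with re.match you would need regex syntax such as '.*google\\.com/.*'.
from fnmatch import fnmatch

prohibited = ['*google.com/*', '*yahoo.com/*']

def is_prohibited(link, patterns=prohibited):
    'True if link matches any prohibited wildcard pattern'
    return any(fnmatch(link, p) for p in patterns)
```

The loop in crawl() would then read: if link not in self.visited and not is_prohibited(link): ...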