Scrapy 爬虫报告找到了网站中并不存在的单词

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from list_loop import *
import re

# Word the spider searches for in the text of every link.
word_to_find = 'pharmacy'


class TestSpider(CrawlSpider):
    """CrawlSpider that follows extracted links and passes each response to parse_item."""

    name = 'test'

    # These are lists of a lot of domains, imported from another
    # file called list_loop.py.
    allowed_domains = strip_url
    start_urls = merch_url

    # A default LinkExtractor with follow=True; every followed response
    # is handed to parse_item.
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield a 'Matching' item for each non-empty link text on the page.

        The text nodes under every <a> element are stripped of surrounding
        whitespace (including newlines), empty strings are dropped, and the
        remainder is lower-cased before the loop below runs.
        """
        stripped = (text.strip() for text in response.xpath("//a//text()").getall())
        cleaned_words = [text.lower() for text in stripped if text]

        # Loop through cleaned_words looking for word_to_find.
        # NOTE: the return value of re.search() is never checked, so the item
        # below is yielded once per entry in cleaned_words whether or not the
        # entry actually contains word_to_find.
        for single_word in cleaned_words:
            re.search(r'\b%s\b' % word_to_find, single_word)
            yield {
                'Matching': 'Found the word {} in {}'.format(word_to_find, response.url)
            }
        else:
            # This else belongs to the for loop and does nothing.
            pass

{"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"}, {"Matching": "Found the word pharmacy in https://www.alibaba.com/?from_http=1"},

1条回答

网友

1楼 · 发布于 2024-05-12 22:04:15

我认为你漏掉了一个 `if` 语句。在你的代码中，`re.search` 的返回值没有被检查，所以无论是否匹配，都会对每个单词执行一次 `yield`：

    # The result of re.search() is discarded here, so the yield below runs
    # for every entry in cleaned_words, whether or not it matches.
    for single_word in cleaned_words:
        re.search(r'\b%s\b' % word_to_find, single_word)
        yield{
            'Matching': 'Found the word {} in {}'.format(word_to_find, response.url)
        }

你想要的应该是下面这种写法，只有在匹配成功时才 `yield`：

    for single_word in cleaned_words:
        # re.search() returns None when there is no match, so the item is
        # only yielded for entries that contain word_to_find as a whole word.
        if re.search(r'\b%s\b' % word_to_find, single_word):
            yield{
                'Matching': 'Found the word {} in {}'.format(word_to_find, response.url)
            }

相关问题更多 >

编程相关推荐

热门问题

热门文章