from collections import Counter
import scrapy
class QuotesSpider(scrapy.Spider):
    """Spider that emits word counts for the text found on the start page.

    For every text node selected under ``<body>``, one ``Counter`` mapping
    each word to its number of occurrences in that node is yielded as an
    item. Summing the counters is left to the consumer of the output.
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        """Yield a ``Counter`` of words for each non-blank text node.

        Args:
            response: The downloaded page; it must provide
                ``xpath(...).extract()`` returning a list of strings.

        Yields:
            collections.Counter: word -> occurrence count for one text node.
            Text nodes that contain only whitespace produce nothing.
        """
        for text in response.xpath('//body//*//text()').extract():
            # str.split() with no separator splits on any run of whitespace
            # (spaces, tabs, newlines) and never returns empty strings, so
            # words separated by a line break are counted individually
            # instead of being glued into one "foo\nbar" token.
            words = text.split()
            if words:
                yield Counter(words)
然后在另一个名为scrapy_runner.py的文件中输入以下代码:
import os
import subprocess
import shlex
import json
from functools import reduce
from operator import add
from collections import Counter
from pprint import pprint
# Feed file the spider writes its items to, and the spider script to run.
FILENAME = 'counters.json'
SCRIPTNAME = 'words_spider.py'

# Start from a clean slate: remove any output left over from a previous run
# so the JSON read below comes only from the run started here.
try:
    os.remove(FILENAME)
except FileNotFoundError:
    pass  # No file to remove

# Run the spider. The command is passed as an argument list (not a string
# that is split again), so file names containing spaces or quotes stay intact.
# check_call raises CalledProcessError if scrapy exits with a non-zero status.
subprocess.check_call(['scrapy', 'runspider', SCRIPTNAME, '-o', FILENAME])

with open(FILENAME, encoding='utf-8') as fh:
    # Create counters out of saved JSON file (one JSON object per item).
    counts = (Counter(item) for item in json.load(fh))
    # Add all the counters together. The empty Counter() initializer keeps
    # reduce() from raising TypeError when the feed contains no items.
    pprint(reduce(add, counts, Counter()), indent=4)
我以前从未使用过 Scrapy,但我想我有一个办法可以统计 HTML 正文中的所有单词。
在名为 words_spider.py 的文件中输入以下代码:
然后在另一个名为 scrapy_runner.py 的文件中输入以下代码:
运行脚本:
python scrapy_runner.py
输出为:
相关问题 更多 >
编程相关推荐