Created a web scraping program in Python, need to save the web links to a CSV and remove duplicates

Posted 2024-05-19 01:49:45


I've been able to scrape the website successfully, but I'm having some trouble saving the results as a csv and need help seeing where I messed up. Here is my code; the error message I get is included below:

import bs4 as BeautifulSoup
import CSV
import re
import urllib.request
from IPython.display import HTML

# Program that scraps the website for 
r= urllib.request.urlopen('https://www.census.gov/programs- 
                           surveys/popest.html').read()
soup = BeautifulSoup(r,"html.parser")


for link in soup.find_all('a'):
    print(link.get('href'))

with open("Giles_C996.csv","w") as csv_file:
    writer = csv.writer(csv_file,delimiter="/n")
    writer.writerow(Links)

Close()

Error message:

Traceback (most recent call last):
  File "C:\Users\epiph\Giles_C996 Project 2.txt", line 2, in <module>
    import CSV
ModuleNotFoundError: No module named 'CSV'



2 Answers

You have some incorrect imports, and you are calling an undefined variable.

I'm not very familiar with IPython, so I can't say much about your use of it. urllib also tends to give me trouble, so I just used requests.

I've left in some commented-out code for an alternative layout of the csv file, a function that helps determine whether a link is valid, and a list comprehension if you prefer that approach. The script also opens the csv file for you at the end.

import csv, re, urllib.request, os
import requests
from bs4 import BeautifulSoup
# from IPython.display import HTML

def exists(link) -> bool:
    """
    Check if request response is 200
    """
    try:
        return 200 == requests.get(link).status_code
    except requests.exceptions.MissingSchema:
        return False
    except requests.exceptions.InvalidSchema:
        return False
    
def scrapeLinks(url):
    """
    Yield each link found on the page at url, skipping
    duplicates and links that don't respond with 200.
    """
    checked = set()
    page = requests.get(url).text
    soup = BeautifulSoup(page,"html.parser")
    for a in soup.find_all('a',href=True):
        link = a['href']
        if link not in checked and exists(link):
            yield link
        checked.add(link)
        
# Program that scrapes the website for 
url = 'https://www.census.gov/programs-surveys/popest.html'
# r = urllib.request.urlopen(url).read()
r = requests.get(url).text
soup = BeautifulSoup(r,"html.parser")

# links = [
    # a['href'] for a in soup.find_all('a',href=True)\
    # if exists(a['href'])
# ]

file_name = "Giles_C996.csv"
with open(file_name,"w") as csv_file:
    # writer = csv.writer(csv_file),delimiter="/n")
    writer = csv.writer(csv_file)
    # writer.writerow(set(links)) # conversion to remove duplicates
    writer.writerow(scrapeLinks(url)) 
    # writer.writerows(enumerate(scrapeLinks(url),1))  ## if you want a 2d-indexed collection

os.startfile(file_name)

# Close()
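One thing to keep in mind: os.startfile only exists on Windows. If you're on macOS or Linux, a small fallback like the sketch below works (open_file is just an illustrative helper name, not part of the code above):

import os
import platform
import subprocess

def open_file(path):
    """Open a file with the platform's default application."""
    if hasattr(os, "startfile"):         # Windows
        os.startfile(path)
    elif platform.system() == "Darwin":  # macOS
        subprocess.run(["open", path])
    else:                                # most Linux desktops
        subprocess.run(["xdg-open", path])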

You imported the csv and bs4 modules incorrectly, and Close() is not valid. You can use a conversion to set to eliminate the duplicates.

import csv
import urllib.request
from bs4 import BeautifulSoup

r = urllib.request.urlopen('https://www.census.gov/programs-surveys/popest.html').read()    
soup = BeautifulSoup(r, "html.parser")
links = set([a['href'] for a in soup.find_all('a', href=True)])

with open("Giles_C996.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows([link] for link in links)

The output is:

https://www.census.gov/programs-surveys/cps.html
/newsroom/press-releases/2020/65-older-population-grows/65-older-population-grows-spanish.html
https://www.census.gov/businessandeconomy
https://www.census.gov/data
/programs-surveys/popest/library.html

etc.
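If you'd rather keep the links in the order they appear on the page while still dropping duplicates, dict.fromkeys (which preserves insertion order on Python 3.7+) is a handy alternative to set; this is just a variation on the same idea:

import csv
import urllib.request
from bs4 import BeautifulSoup

r = urllib.request.urlopen('https://www.census.gov/programs-surveys/popest.html').read()
soup = BeautifulSoup(r, "html.parser")

# dict.fromkeys keeps only the first occurrence of each href, in page order
links = list(dict.fromkeys(a['href'] for a in soup.find_all('a', href=True)))

with open("Giles_C996.csv", "w", newline='') as f:
    csv.writer(f).writerows([link] for link in links)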
