网页抓取不工作,即使没有错误
我想写一段Python代码来抓取这些数据(https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)。
这是我的代码:
import os
import requests
import random
import time
import pyarrow.parquet as pq
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from fake_useragent import UserAgent
# URL de la page contenant les liens vers les datasets
base_url = "https://d37ci6vzurychx.cloudfront.net/trip-data/"
response = requests.get(base_url)
soup = BeautifulSoup(response.text, "html.parser")
# Chemin où enregistrer les fichiers
download_directory = "C:/Users/flosr/Engineering/Blent.ai Project/datas"
# Fonction pour télécharger un fichier avec un en-tête utilisateur aléatoire et une pause aléatoire
def download_file(url, file_path):
user_agent = UserAgent().random
headers = {"User-Agent": user_agent}
time.sleep(random.uniform(1, 3)) # Ajouter une pause aléatoire entre 1 et 3 secondes
response = requests.get(url, headers=headers)
with open(file_path, "w") as f:
f.write(response.content)
# Parcourir chaque section contenant les liens pour chaque année
for section in soup.find_all("div", class_="faq-answers"):
year = section.find_previous_sibling("div", class_="faq-questions").text.strip()
print(f"Downloading datasets for year {year}...")
# Créer un sous-répertoire pour chaque année
year_directory = os.path.join(download_directory, year)
os.makedirs(year_directory, exist_ok=True)
# Télécharger les fichiers pour chaque mois de l'année
for link in section.find_all("a"):
file_url = urljoin(base_url, link.get("href"))
filename = os.path.basename(file_url)
file_path = os.path.join(year_directory, filename)
# Télécharger le fichier
print(f"Downloading {filename}...")
download_file(file_url, file_path)
# Convertir le fichier Parquet
pq.write_table(pq.read_table(file_path), file_path.replace('.parquet', '.csv'))
print("Download and conversion complete.")
这是输出结果:
PS C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code> & 'c:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\Scripts\python.exe' 'c:\Users\flosr\.vscode\extensions\ms-python.debugpy-2024.2.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher' '63645' '--' 'C:\Users\flosr\Engineering\Blent.ai Project\datas\WebScraping Code\env\main.py'
Download and conversion complete.
但是,在指定的文件夹里什么都没有显示。没有出现错误,但它还是不工作。而且不知为什么,它一直在安装下面的依赖项,根本停不下来。
如果没有任何错误提示,我就无法尝试解决问题。
1 个回答
1
看起来有些网址的末尾有空格字符,这些空格需要去掉:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
headers = {
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:124.0) Gecko/20100101 Firefox/124.0"
}
def save_url(url, path):
response = requests.get(url, headers=headers, stream=True)
total_size = int(response.headers.get("content-length", 0))
with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
with open(path, "wb") as file:
for data in response.iter_content(1024):
progress_bar.update(len(data))
file.write(data)
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for a in soup.select("table a"):
month = a.find_previous("strong").get_text(strip=True)
year = a.find_previous(attrs={"data-answer": True}).get_text(strip=True)
u = a["href"].strip() # <-- important part!
path = f'{year}_{month}_{u.split("/")[-1]}'
print(year, month, u, f"Saving to {path}...")
save_url(u, path)
print("\n", "-" * 80)
输出结果是:
...
100%|█████████████| 50.0M/50.0M [00:01<00:00, 37.1MB/s]
--------------------------------------------------------------------------------
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-01.parquet Saving to 2024_January_green_tripdata_2024-01.parquet...
100%|█████████████| 1.36M/1.36M [00:00<00:00, 5.32MB/s]
--------------------------------------------------------------------------------
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2024-01.parquet Saving to 2024_January_fhv_tripdata_2024-01.parquet...
100%|█████████████| 15.0M/15.0M [00:00<00:00, 42.1MB/s]
--------------------------------------------------------------------------------
2024 January https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet Saving to 2024_January_fhvhv_tripdata_2024-01.parquet...
100%|█████████████| 473M/473M [00:15<00:00, 31.0MB/s]
--------------------------------------------------------------------------------
2023 January https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet Saving to 2023_January_yellow_tripdata_2023-01.parquet...
100%|█████████████| 47.7M/47.7M [00:01<00:00, 42.4MB/s]
--------------------------------------------------------------------------------
2023 January https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet Saving to 2023_January_green_tripdata_2023-01.parquet...
100%|█████████████| 1.43M/1.43M [00:00<00:00, 46.7MB/s]
--------------------------------------------------------------------------------
...