How do I make all arrays the same length?
I'm trying to write a web crawler in Python that looks for contact information on websites and then saves it to Excel. The program doesn't seem to work properly, though, even after trying several approaches. Here is my code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from openpyxl import load_workbook
import re
def scrape_contact_data(url):
    # Function definition for scraping contact data
    print("Scraping contact data from:", url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all text content within the webpage
    all_text = soup.get_text(separator="\n", strip=True)

    # Initialize lists to store data
    company_names = []
    street_numbers = []
    postal_codes = []
    countries = []
    email_addresses = []
    phone_numbers = []
    fax_numbers = []
    websites = []

    # Search for patterns indicative of contact information
    # Email addresses pattern
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
    email_addresses.extend(re.findall(email_pattern, all_text))
    print("Number of email addresses found:", len(email_addresses))

    # Phone numbers pattern
    phone_pattern = r'(?:(?:\+|0{0,2})\d{1,4}[\s.-]?)?(?:\(\d{1,5}\)[\s.-]?)?\d{3,5}[\s.-]?\d{3,4}[\s.-]?\d{3,4}'
    phone_numbers.extend(re.findall(phone_pattern, all_text))
    print("Number of phone numbers found:", len(phone_numbers))

    # Fax numbers pattern
    fax_pattern = r'Fax:\s*([\d\s/-]+)'
    fax_numbers.extend(re.findall(fax_pattern, all_text))
    print("Number of fax numbers found:", len(fax_numbers))

    # Address pattern (assuming typical address format)
    address_pattern = r'\b\d{1,5}\s+[\w\s]+\b'
    addresses = re.findall(address_pattern, all_text)
    for address in addresses:
        # Split address into street number and name
        street_number, street_name = address.split(maxsplit=1)
        street_numbers.append(street_number)
        # Extract postal code and country from the last part of the address
        postal_code_country_pattern = r'(\d{4,})\s+([\w\s]+)$'
        match = re.search(postal_code_country_pattern, street_name)
        if match:
            postal_codes.append(match.group(1))
            countries.append(match.group(2))
        else:
            postal_codes.append('-')
            countries.append('-')
    print("Number of addresses found:", len(street_numbers))
    print("Number of postal codes found:", len(postal_codes))
    print("Number of countries found:", len(countries))

    # Assuming company name is not readily available in the webpage content
    company_names.extend(['-'] * len(email_addresses))
    # Assuming website URLs are not readily available in the webpage content
    websites.extend(['-'] * len(email_addresses))

    # Create a DataFrame to store the data
    data = {
        'Company Name': company_names,
        'Street & House number': street_numbers,
        'Postal Code': postal_codes,
        'Country': countries,
        'Email Address': email_addresses,
        'Phone Number': phone_numbers,
        'Fax Number': fax_numbers,
        'Website': websites
    }
    df = pd.DataFrame(data)
    return df
def extract_email_from_subpage(subpage_url):
    try:
        response = requests.get(subpage_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        contact_block = soup.find('div', class_='cb-orte-item-adresse')
        if contact_block:
            email_element = contact_block.find('span', class_='data-emailencrypted').find_next('a', class_='value')
            email = email_element.get('href').split(':')[-1] if email_element else "Nicht gefunden"
            return email
        else:
            print("Kontaktblock nicht gefunden auf der Seite:", subpage_url)
            return "-"
    except Exception as e:
        print("Fehler beim Abrufen der Seite:", e)
        return "-"
def save_to_excel(df, filename='contact_data.xlsx'):
    # Function definition for saving DataFrame to Excel
    try:
        # Load existing Excel file
        wb = load_workbook(filename)
        # Select the active worksheet
        ws = wb.active
        # Find the next empty row in the worksheet
        next_row = ws.max_row + 1
        # Write the DataFrame to the Excel file starting from the next empty row
        for index, row in df.iterrows():
            for col_idx, value in enumerate(row, start=1):
                ws.cell(row=next_row + index, column=col_idx, value=value)
        # Save the updated workbook
        wb.save(filename)
        print("Data appended to", filename)
    except FileNotFoundError:
        # If the file doesn't exist, create a new Excel file
        df.to_excel(filename, index=False)
        print("New file created:", filename)
if __name__ == "__main__":
    urls = []
    while True:
        url = input("Enter a URL (or type 'done' to finish): ")
        if url.lower() == 'done':
            break
        urls.append(url)

    if not urls:
        print("No URLs provided. Exiting.")
    else:
        # Initialize an empty DataFrame to store all contact data
        all_contact_data = pd.DataFrame()
        # Iterate through each URL, scrape contact data, and concatenate to the DataFrame
        for url in urls:
            print("Scraping data from:", url)
            try:
                contact_df = scrape_contact_data(url)
                all_contact_data = pd.concat([all_contact_data, contact_df], ignore_index=True)
            except Exception as e:
                print(f"Error occurred while scraping data from {url}: {e}")
        # Specify the location where you want to save the Excel file
        save_location = "C:\\Users\\-----\\Desktop\\contact_data.xlsx"  # Replace with your desired location
        # Save all contact data to the specified location
        save_to_excel(all_contact_data, filename=save_location)
Here is the output:
Number of email addresses found: 0
Number of phone numbers found: 0
Number of fax numbers found: 0
Number of addresses found: 5
Number of postal codes found: 5
Number of countries found: 5
Error occurred while scraping data from [hidden for obvious reasons]: All arrays must be of the same length
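For reference, the error comes from pandas itself: pd.DataFrame raises it whenever the columns in the dict have different lengths. A minimal reproduction with made-up lists:

import pandas as pd

# Columns of unequal length reproduce the error from my output above
data = {'Email Address': ['a@example.com'], 'Postal Code': ['12345', '67890']}
df = pd.DataFrame(data)  # ValueError: All arrays must be of the same length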
I want it to output a block of information like this:
Name
Address
Postal code and city
Phone
Fax
Email
1 Answer
Get every element that contains an individual's contact details into a list.
# driver_table contains details about all the drivers
driver_table = soup.find('table', attrs = {'class': 'standing-table__table'})
table_body = driver_table.find('tbody')
# Get all the rows of driver details. Each row contain individual driver details
rows = table_body.find_all("tr", attrs = {'class':'standing-table__row'})
Loop through this list and extract each person's contact details (name, email, etc.) one by one. Use a try/except statement to extract each detail, and if a detail is missing, append an empty string "" (a blank) to the list.
try:
    name.append(cells[1].text.strip())  # First td element is the name
except:
    # Appending "" so the length of the list remains the same in case of missing data
    name.append("")
This way, whenever a piece of information is missing, a blank is appended instead, so all of the contact-detail lists stay the same length.
Sample code:
from bs4 import BeautifulSoup
import requests
import pandas as pd

r = requests.get("https://www.skysports.com/f1/standings")
soup = BeautifulSoup(r.content, 'html.parser')

name = []
team = []
country = []

driver_table = soup.find('table', attrs={'class': 'standing-table__table'})
table_body = driver_table.find('tbody')
# Get all the rows of driver details
rows = table_body.find_all("tr", attrs={'class': 'standing-table__row'})

for row in rows:
    # Get all the elements with driver details having the same class
    cells = row.find_all("td", attrs={'class': 'standing-table__cell'})
    try:
        name.append(cells[1].text.strip())  # First td element is the name
    except:
        # Appending "" so the length of the list remains the same in case of missing data
        name.append("")
    try:
        country.append(cells[2].text.strip())  # Second is the country
    except:
        country.append("")
    try:
        team.append(cells[3].text.strip())  # Third is the team
    except:
        team.append("")

data = {
    'Name': name, 'Country': country, 'Team': team
}
df = pd.DataFrame(data)
print(df)
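Alternatively, if you want to keep the regex-based scraping from the question, you can pad the lists to a common length just before building the DataFrame. Below is a minimal sketch using itertools.zip_longest; the sample lists and the '-' filler are placeholders, not values taken from the question's pages:

from itertools import zip_longest

import pandas as pd

# Placeholder lists standing in for the scraped results,
# deliberately of different lengths
email_addresses = ['a@example.com', 'b@example.com']
phone_numbers = ['123-456-7890']
fax_numbers = []

columns = {
    'Email Address': email_addresses,
    'Phone Number': phone_numbers,
    'Fax Number': fax_numbers,
}

# zip_longest pads the shorter lists with fillvalue, so every row
# of the resulting table has an entry for every column
rows = list(zip_longest(*columns.values(), fillvalue='-'))
df = pd.DataFrame(rows, columns=columns.keys())
print(df)

Note that padding only fixes the length mismatch; it does not guarantee that the values in a given row actually belong to the same contact, which is why extracting all details row by row, as in the example above, is the more reliable approach.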