BeautifulSoup: how to scrape multiple URLs and save them to one CSV file

Posted 2024-04-20 14:28:16


I'd like to know how to scrape multiple websites/URLs and save the data from them into a CSV file. Right now I can only save the first page. I've tried many different approaches, but none of them seem to work. How can I save five pages into the CSV file instead of just one?

import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import timedelta
import datetime
import time

urls = ['https://store.steampowered.com/search/?specials=1&page=1',
        'https://store.steampowered.com/search/?specials=1&page=2',
        'https://store.steampowered.com/search/?specials=1&page=3',
        'https://store.steampowered.com/search/?specials=1&page=4',
        'https://store.steampowered.com/search/?specials=1&page=5']

for url in urls:   
    my_url = requests.get(url) 
    html = my_url.content
    soup = BeautifulSoup(html,'html.parser')

    data = []
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') 

    for container in soup.find_all('div', attrs={'class':'responsive_search_name_combined'}):
        title = container.find('span',attrs={'class':'title'}).text

        if container.find('span',attrs={'class':'win'}):
            win = '1'
        else:
            win = '0'

        if container.find('span',attrs={'class':'mac'}):
            mac = '1'
        else:
            mac = '0'

        if container.find('span',attrs={'class':'linux'}):
            linux = '1'
        else:
            linux = '0'

        data.append({
            'Title':title.encode('utf-8'),
            'Time':st,
            'Win':win,
            'Mac':mac,
            'Linux':linux})

with open('data.csv', 'w',encoding='UTF-8', newline='') as f:
    fields = ['Title','Win','Mac','Linux','Time']
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)
testing = pd.read_csv('data.csv')
heading = testing.head(100)
description = testing.describe()
print(heading)

2 Answers

So I was apparently blind to my own code, which is what happens when you stare at it all day. All I actually had to do was move "data = []" above the for loop so that it doesn't get reset on every iteration.
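A minimal sketch of that fix, condensed from the code in the question (the URL list is built with a comprehension here, since the five URLs differ only in their page number):

import csv
import datetime
import time

import requests
from bs4 import BeautifulSoup

urls = [f'https://store.steampowered.com/search/?specials=1&page={n}'
        for n in range(1, 6)]

data = []  # initialized once, before the URL loop, so rows from every page accumulate
for url in urls:
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    st = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')

    for container in soup.find_all('div', attrs={'class': 'responsive_search_name_combined'}):
        data.append({
            'Title': container.find('span', attrs={'class': 'title'}).text,
            'Time': st,
            'Win': '1' if container.find('span', attrs={'class': 'win'}) else '0',
            'Mac': '1' if container.find('span', attrs={'class': 'mac'}) else '0',
            'Linux': '1' if container.find('span', attrs={'class': 'linux'}) else '0'})

# write once, after all pages have been scraped
with open('data.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['Title', 'Win', 'Mac', 'Linux', 'Time'])
    writer.writeheader()
    writer.writerows(data)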

The problem is that data is re-initialized after each URL and only written out after the last iteration, which means you always end up with just the data from the last URL. You need to append the data after each iteration instead of overwriting it:

import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import timedelta
import datetime
import time

urls = ['https://store.steampowered.com/search/?specials=1&page=1',
        'https://store.steampowered.com/search/?specials=1&page=2',
        'https://store.steampowered.com/search/?specials=1&page=3',
        'https://store.steampowered.com/search/?specials=1&page=4',
        'https://store.steampowered.com/search/?specials=1&page=5']

results_df = pd.DataFrame() #<  initialize a results dataframe to dump/store the data you collect after each iteration
for url in urls:   
    my_url = requests.get(url) 
    html = my_url.content
    soup = BeautifulSoup(html,'html.parser')

    data = []  #<  your data list is "reset" after each iteration of your urls
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') 

    for container in soup.find_all('div', attrs={'class':'responsive_search_name_combined'}):
        title = container.find('span',attrs={'class':'title'}).text

        if container.find('span',attrs={'class':'win'}):
            win = '1'
        else:
            win = '0'

        if container.find('span',attrs={'class':'mac'}):
            mac = '1'
        else:
            mac = '0'

        if container.find('span',attrs={'class':'linux'}):
            linux = '1'
        else:
            linux = '0'

        data.append({
            'Title':title,
            'Time':st,
            'Win':win,
            'Mac':mac,
            'Linux':linux})

    temp_df = pd.DataFrame(data)  #<  this page's rows in a temporary dataframe, built after the container loop so each row is stored exactly once
    results_df = pd.concat([results_df, temp_df], ignore_index=True)  #<  dumping that data into the results dataframe (pd.concat, since DataFrame.append was removed in pandas 2.0)


results_df.to_csv('data.csv', index=False) #<  writing the results dataframe to csv

testing = pd.read_csv('data.csv')
heading = testing.head(100)
description = testing.describe()
print(heading)
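As a side note, the five URLs don't have to be written out by hand: they differ only in the page query parameter, and requests can assemble the query string itself. A small sketch under that assumption, keeping the same search endpoint:

import requests

for page in range(1, 6):
    # requests builds '?specials=1&page=N' from the params dict
    resp = requests.get('https://store.steampowered.com/search/',
                        params={'specials': 1, 'page': page})
    resp.raise_for_status()  # stop early on a bad HTTP status
    html = resp.content
    # ... parse html with BeautifulSoup exactly as above ...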

