如何将CSV导入到数据由具有父/子关系的索引列组织的位置？

'Acct01','Freds Autoshop','3-way-Cntrl','Y',1000,576,686,837 'Acct01','Freds Autoshop','3-way-Cntrl','Y',1001,683,170,775 'Acct02','Daves Tacos',2,'centrifugal','N',1000,334,787,143 'Acct02','Daves Tacos',2,'centrifugal','N',1001,749,132,987

import pandas as pd import numpy as np import time file = 'C:\\PythonData\\AcctData.txt' t0 = time.time() pdata = [] # Parse data acct = [] # Account Data row = {} #Assembly Container #Set dataframe columns df = pd.DataFrame(columns=['Account','Name','Type','Flag','Counter','CNT01','CNT02','CNT03']) # open the file and read through it line by line with open(file, 'r') as f: for line in f: #Strip each line pdata = [x.strip() for x in line.split(',')] #Use the index to parse data into either acct[] for use on the rows with counter > 2 indx = int(pdata[0]) if indx == 1: acct.clear() acct.append(pdata[1]) acct.append(pdata[2]) elif indx == 2: acct.append(pdata[1]) acct.append(pdata[2]) else: row.clear() row['Account'] = acct[0] row['Name'] = acct[1] row['Type'] = acct[2] row['Flag'] = acct[3] row['Counter'] = pdata[0] row['CNT01'] = pdata[1] row['CNT02'] = pdata[2] row['CNT03'] = pdata[3] if indx > 2: #data.append(row) df = df.append(row, ignore_index=True) t1 = time.time() totalTimeDf = t1-t0 TTDf = '%.3f'%(totalTimeDf) print(TTDf + " Seconds to Complete df: " + i_filepath) print(df)

0.018 Seconds to Complete df: C:\PythonData\AcctData.txt Account Name Type Flag Counter CNT01 CNT02 CNT03 0 'Acct01' 'Freds Autoshop' '3-way-Cntrl' 'Y' 1000 576 686 837 1 'Acct01' 'Freds Autoshop' '3-way-Cntrl' 'Y' 1001 683 170 775 2 'Acct02' 'Daves Tacos' 'centrifugal' 'N' 1000 334 787 143 3 'Acct02' 'Daves Tacos' 'centrifugal' 'N' 1001 749 132 987

import csv from collections import OrderedDict od = OrderedDict() file_name = 'C:\\PythonData\\AcctData.txt' try: csvfile = open(file_name, 'rt') except: print("File not found") csvReader = csv.reader(csvfile, delimiter=",") for row in csvReader: key = row[0] od.setdefault(key,[]).append(row) od

OrderedDict([('1', [['1', "'Acct01'", "'Freds Autoshop'"], ['1', "'Acct02'", "'Daves Tacos'"]]), ('2', [['2', "'3-way-Cntrl'", "'Y'"], ['2', "'centrifugal'", "'N'"]]), ('1000', [['1000', '576', '686', '837'], ['1000', '334', '787', '143']]), ('1001', [['1001', '683', '170', '775'], ['1001', '749', '132', '987']])])

1条回答

网友

1楼 · 发布于 2024-05-13 09:56:48

我不确定这是禁食法还是Python法，但我相信熊猫法可以，因为你需要以一种奇怪的、真正具体的方式每4行迭代一次：

首先导入要使用的库：

import pandas as pd
import numpy as np

因为我没有要加载的文件，所以我只是将其重新创建为一个数组（这部分您需要做一些工作，或者只需将其加载到一个有4列的pandas数据帧即可[就像下一步]）：

data = [[1,'Acct01','Freds Autoshop'],   
[2,'3-way-Cntrl','Y'  ], 
[1000,576,686,837   ],
[1001,683,170,775   ],
[1002,333,44,885   ],
[1003,611183,12,1   ],
[1,'Acct02','Daves Tacos'   ],
[2,'centrifugal','N'   ],
[1000,334,787,143  ] ,
[1001,749,132,987],
[1,'Acct03','Norah Jones'   ],
[2,'undertaker','N'   ],
[1000,323,1,3  ] ,
[1001,311,2,111  ] ,
[1002,95,112,4]]

用上述数据创建了一个dataframe+用numpy的nans（比panda的快）作为占位符创建了新的列。你知道吗

df = pd.DataFrame(data)
df['4']= np.nan
df['5']= np.nan
df['6']= np.nan
df['7']= np.nan
df['8']= np.nan
df.columns = ['idx','Account','Name','Type','Flag','Counter','CNT01','CNT02','CNT3']

制作一个新的df，它将获得每次“AcctXXXX”的apears和下一个父级之前的行数。你知道吗

# Getting the unique "Acct" and their index position into an array
acct_idx_pos = np.array([df[df['Account'].str.contains('Acct').fillna(False)]['Account'].values, df[df['Account'].str.contains('Acct').fillna(False)].index.values])

# Making a df with the transposed array
df_pos = pd.DataFrame(acct_idx_pos.T, columns=['Acct', 'Position'])

# Shifting the values into a new column and filling the last value (nan) with the df length
df_pos['End_position'] = df_pos['Position'].shift(-1)
df_pos['End_position'][-1:] = len(df)

# Making the column we want, that is the number of loops we'll go
df_pos['Position_length'] = df_pos['End_position'] - df_pos['Position']

使用伪数据帧并连接临时数据帧的自定义函数（稍后使用）

def concatenate_loop_dfs(df_temp, df_full, axis=0):
    """
    to avoid retyping the same line of code for every df.
    the parameters should be the temporary df created at each loop and the concatenated DF that will contain all
    values which must first be initialized (outside the loop) as df_name = pd.DataFrame(). """      

    if df_full.empty:
        df_full = df_temp
    else:
        df_full = pd.concat([df_full, df_temp], axis=axis)

    return df_full

创建了一个函数，该函数将循环填充每一行并删除重复的行：

# a complicated loop function
def shorthen_df(df, num_iterations):

    # to not delete original df
    dataframe = df.copy()

    # for the slicing, we need to start at the first row.
    curr_row = 1

    # fill current row's nan values with values from next row
    dataframe.iloc[curr_row-1:curr_row:,3] = dataframe.iloc[curr_row:curr_row+1:,1].values
    dataframe.iloc[curr_row-1:curr_row:,4] = dataframe.iloc[curr_row:curr_row+1:,2].values
    dataframe.iloc[curr_row-1:curr_row:,5] = dataframe.iloc[curr_row+1:curr_row+2:,0].values
    dataframe.iloc[curr_row-1:curr_row:,6] = dataframe.iloc[curr_row+1:curr_row+2:,1].values
    dataframe.iloc[curr_row-1:curr_row:,7] = dataframe.iloc[curr_row+1:curr_row+2:,2].values
    dataframe.iloc[curr_row-1:curr_row:,8] = dataframe.iloc[curr_row+1:curr_row+2:,3].values

    # the "num_iterations-2" is because the first two lines are filled and not replaced 
    # as the next ones will be. So this will vary correctly to each "account"
    for i in range(1, num_iterations-2):
        # Replaces next row with values from previous row
        dataframe.iloc[curr_row+(i-1):curr_row+i:] = dataframe.iloc[curr_row+(i-2):curr_row+(i-1):].values
        dataframe.iloc[curr_row+(i-1):curr_row+i:,5] = dataframe.iloc[curr_row+i+1:curr_row+i+2:,0].values
        dataframe.iloc[curr_row+(i-1):curr_row+i:,6] = dataframe.iloc[curr_row+i+1:curr_row+i+2:,1].values
        dataframe.iloc[curr_row+(i-1):curr_row+i:,7] = dataframe.iloc[curr_row+i+1:curr_row+i+2:,2].values
        dataframe.iloc[curr_row+(i-1):curr_row+i:,8] = dataframe.iloc[curr_row+i+1:curr_row+i+2:,3].values

    # last 2 rows of df
    dataframe = dataframe[0:len(dataframe)-2]

    return dataframe

最后，使用上述两个函数创建虚拟DF，该DF将包含所有“Acct”并为每个DF循环其位置。你知道吗

df_final= pd.DataFrame()
for start, end, iterations in zip(df_pos.Position.values, df_pos.End_position.values, df_pos.Position_length.values):
    df2 = df[start:end]

    df_temp = shorthen_df(df2, iterations)

    df_final = concatenate_loop_dfs(df_temp, df_final)

# Dropping first/unnecessary columns
df_final.drop('idx', axis=1, inplace=True)


# resetting index
df_final.reset_index(inplace=True, drop=True)
df_final

    Account Name    Type    Flag    Counter CNT01   CNT02   CNT3
0   Acct01  Freds Autoshop  3-way-Cntrl Y   1000.0  576 686 837
1   Acct01  Freds Autoshop  3-way-Cntrl Y   1001.0  683 170 775
2   Acct01  Freds Autoshop  3-way-Cntrl Y   1002.0  333 44  885
3   Acct01  Freds Autoshop  3-way-Cntrl Y   1003.0  611183  12  1
4   Acct02  Daves Tacos centrifugal N   1000.0  334 787 143
5   Acct02  Daves Tacos centrifugal N   1001.0  749 132 987
6   Acct03  Norah Jones undertaker  N   1000.0  323 1   3
7   Acct03  Norah Jones undertaker  N   1001.0  311 2   111
8   Acct03  Norah Jones undertaker  N   1002.0  95  112 4

相关问题更多 >

编程相关推荐

热门问题

热门文章