Generating a larger synthetic dataset from a smaller dataset in Python

Posted 2024-04-27 19:51:50


I have a dataset with 21000 rows (data samples) and 102 columns (features). I would like to generate a larger synthetic dataset based on the current one, say 100000 rows, so that I can use it for machine learning purposes.

I have been following @Prashant's answer in this post https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data, but could not get it to generate a larger synthetic dataset for my data.

import numpy as np
from random import choice
from sklearn.neighbors import NearestNeighbors
import pandas as pd
# referring to https://stats.stackexchange.com/questions/215938/generate-synthetic-data-to-match-sample-data


df = pd.read_pickle('df_saved.pkl')
# drop the trailing label column; this leaves df, the smaller 21000x102
# DataFrame that I would like to generate a larger dataset from
df = df.iloc[:, :-1]


def SMOTE(T, N, k):
    """
    Returns (N/100) * n_minority_samples synthetic minority samples.

    Parameters
    ----------
    T : array-like, shape = [n_minority_samples, n_features]
        Holds the minority samples
    N : percentage of new synthetic samples:
        n_synthetic_samples = N/100 * n_minority_samples. Can be < 100.
    k : int. Number of nearest neighbours.

    Returns
    -------
    S : array, shape = [(N/100) * n_minority_samples, n_features]
    """
    n_minority_samples, n_features = T.shape

    if N < 100:
        # create synthetic samples only for a subset of T.
        # TODO: select random minority samples
        N = 100

    if (N % 100) != 0:
        raise ValueError("N must be < 100 or multiple of 100")

    N = int(N / 100)  # keep N an int so it works in range() and index arithmetic below
    n_synthetic_samples = N * n_minority_samples
    S = np.zeros(shape=(n_synthetic_samples, n_features))

    # Learn nearest neighbours
    neigh = NearestNeighbors(n_neighbors=k)
    neigh.fit(T)

    # Calculate synthetic samples
    for i in range(n_minority_samples):
        # this call raises "Expected 2D array, got 1D array" (see the traceback below)
        nn = neigh.kneighbors(T[i], return_distance=False)
        for n in range(N):
            nn_index = choice(nn[0])
            # NOTE: nn includes T[i], we don't want to select it
            while nn_index == i:
                nn_index = choice(nn[0])

            dif = T[nn_index] - T[i]
            gap = np.random.random()
            S[n + i * N, :] = T[i, :] + gap * dif[:]

    return S


df = df.to_numpy()
# this is where I call the function and expect new_data to come out with more
# samples than the original df (N=50 takes the N < 100 branch above, so it
# actually produces one synthetic sample per original row)
new_data = SMOTE(df, 50, 10)

The traceback of the error I get ends with the following:

ValueError: Expected 2D array, got 1D array instead.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

I understand that this error (Expected 2D array, got 1D array) occurs at the line nn = neigh.kneighbors(T[i], return_distance=False). To be precise, when I call the function, T is a numpy array of shape (21000, 102), my data converted from a Pandas DataFrame to a numpy array. I know there may be similar duplicates of this question, but none of them answer it. Any help on this would be greatly appreciated.


Tags: data, import, numpy, df
3 Answers

I had the same problem. I researched it for a while but couldn't find a suitable solution, so I tried applying my own fix to the problem. It helped me, and I hope it works for everyone who has the same issue.

columns = df.columns.to_numpy()
iteration_count = 30
synthetic_rows = []

for i in range(iteration_count):
    for _ in range(len(df)):
        data_obj = {}
        for j in range(columns.size):
            # draw each feature value from a randomly chosen original row
            random_index = np.random.randint(0, len(df))
            data_obj[columns[j]] = df.loc[random_index, columns[j]]
        synthetic_rows.append(data_obj)

# DataFrame.append was removed in pandas 2.0, so build the new rows once
# and concatenate, instead of appending row by row
new_df = pd.DataFrame(synthetic_rows, columns=columns)
df = pd.concat([df, new_df], ignore_index=True)
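With iteration_count = 30 and the 21000-row df from the question, this adds 30 × 21000 = 630000 synthetic rows, well past the 100000 target. Note that each feature value is drawn from a different random row, so this approach mixes values across samples rather than interpolating between neighbours.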

This might be useful for you:

SMOTE and other advanced over_sampling techniques

The imblearn package has a sklearn-like API and many oversampling techniques.
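A minimal sketch of that API (the X and y below are made-up stand-ins for the question's data; note that imblearn's SMOTE balances classes, so it expects a label vector y and only oversamples the minority class):

import numpy as np
from imblearn.over_sampling import SMOTE

# made-up stand-ins: a feature matrix with the question's 21000x102 shape
# and an imbalanced binary label, which imblearn's SMOTE requires
X = np.random.rand(21000, 102)
y = np.random.choice([0, 1], size=21000, p=[0.8, 0.2])

# oversample the minority class by interpolating between 5 nearest neighbours
sm = SMOTE(k_neighbors=5, random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)
print(X_resampled.shape)  # more rows than X: minority class brought up to parity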

So T[i] gives you an array of shape (102,).

The function expects an array of shape (1, 102).

You can get that by calling reshape:

nn = neigh.kneighbors(T[i].reshape(1, -1), return_distance=False)

In case you're not familiar with np.reshape: the 1 means the first dimension should have size 1, and the -1 tells numpy to infer the second dimension from the array's size; in this case, the original 102.
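A quick way to see the shapes involved, using a dummy array with the question's dimensions:

import numpy as np

T = np.zeros((21000, 102))          # dummy array with the question's shape
print(T[0].shape)                   # (102,)  -> 1D, which kneighbors rejects
print(T[0].reshape(1, -1).shape)    # (1, 102) -> 2D: one sample, 102 features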
