基于完整的数据帧对一个不完整的数据帧进行排序

# Match every partially filled row of df_x against the complete rows of df_y.
x2y = []       # (i, j) pairs: row i of df_x agrees with row j of df_y on all filled cells
no_label = []  # rows of df_x for which no agreeing row of df_y was found
for i in df_x.index:
    a = df_x[i:i+1]  # receives one line of df_x at a time
    a = a.loc[:, (a != 0).any(axis=0)]  # excludes the zeros (unfilled parts of the puzzle)
    match = False  # stays False when df_y has no rows at all
    for j in df_y.index:  # loops over all lines of df_y
        # Reset for every candidate row; without this, a single mismatch on an
        # earlier row of df_y would reject every later row as well.
        match = True
        for a_i in a:
            if a[0:1][a_i].item() != df_y[j:j+1][a_i].item():
                # One filled cell differs from this solution, so move on to
                # the next line in df_y.
                match = False
                break
        if match:
            x2y.append((i, j))
            # NOTE(review): this overwrites row i of df_y with row j instead of
            # swapping them, so the previous content of row i is lost — confirm
            # that this is intended.
            df_y[i:i+1] = df_y[j:j+1]  # replace label at the position of interest
            break
    if not match:
        no_label.append(i)  # unsolved puzzles with no label

df_x.head() Out[58]: 0 1 2 3 4 5 ... 75 76 77 78 79 80 0 0.0 0.0 0.0 0.0 0.0 168.0 ... 0.0 0.0 886.0 0.0 0.0 973.0 1 0.0 0.0 0.0 0.0 0.0 168.0 ... 0.0 0.0 886.0 899.0 0.0 973.0 2 0.0 0.0 0.0 0.0 0.0 168.0 ... 0.0 0.0 886.0 899.0 0.0 973.0 3 0.0 0.0 0.0 0.0 0.0 168.0 ... 0.0 0.0 886.0 899.0 0.0 973.0 4 0.0 0.0 0.0 149.0 0.0 168.0 ... 0.0 0.0 886.0 899.0 0.0 973.0 [5 rows x 81 columns] df_y.head() Out[59]: 0 1 2 3 4 ... 76 77 78 79 80 0 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 1 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 2 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 3 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 4 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 [5 rows x 81 columns]

df_x.head() Out[59]: 0 1 2 3 4 ... 76 77 78 79 80 0 0.0 126.0 0.0 149.0 0.0 ... 0.0 0.0 0.0 0.0 997.0 1 111.0 0.0 0.0 0.0 152.0 ... 953.0 0.0 0.0 984.0 0.0 2 112.0 0.0 137.0 0.0 0.0 ... 0.0 961.0 0.0 0.0 997.0 3 0.0 121.0 0.0 0.0 0.0 ... 0.0 962.0 973.0 984.0 0.0 4 0.0 0.0 133.0 144.0 155.0 ... 956.0 0.0 978.0 0.0 0.0 df_y.head() Out[59]: 0 1 2 3 4 ... 76 77 78 79 80 0 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 1 111.0 123.0 139.0 147.0 152.0 ... 955.0 968.0 973.0 984.0 991.0 2 112.0 126.0 137.0 149.0 154.0 ... 956.0 961.0 973.0 982.0 997.0 3 119.0 121.0 138.0 147.0 156.0 ... 959.0 962.0 973.0 984.0 995.0 4 116.0 127.0 133.0 144.0 155.0 ... 956.0 962.0 978.0 989.0 992.0

1条回答

网友

1楼 · 发布于 2024-04-25 14:45:44

欢迎来到 pandas！这是一个相当困难的问题，因为看起来你需要做 1e5*1e5 次比较，无论怎么做都不会很快，所以我们要尽量减少比较的次数。首先，请尽量让相互匹配的行在两个数据帧中的索引位置彼此接近。其次，下面这段代码可以让匹配判断更简单一些：

对于两个 Series `x_row` 和 `y_row`：

> x_row = pd.Series([1, 2, 0, 4])
> y_row = pd.Series([1, 2, 3, 4])
> ((x_row == y_row) | (x_row == 0)).all()
True

最后一行对两个检查的结果做按位“或”（|）：第一个检查是每个值是否与另一个 Series 中的对应值相等（T, T, F, T），第二个检查是 x_row 中的值是否为零（F, F, T, F）。这两个布尔 Series 按位“或”的结果是 T, T, T, T，因此 .all() 返回 True。

下面是在实际场景中使用它的一个例子：df_y 中的某一行一旦找到匹配，就把它从候选列表中移除，以此减少比较的次数。在理想情况下，比较的次数只与你的行数相同。

# Greedy one-to-one matching of partially filled rows (df_x) to complete rows (df_y).
x2y = []          # (x_idx, y_idx) pairs for rows that were matched
unmatched_x = []  # index labels of df_x rows with no agreeing df_y row
unmatched_y = df_y.index.tolist()  # df_y rows still available as candidates
# iterrows() yields (index label, row Series) pairs; iterating the DataFrame
# directly would yield its column labels instead.
for x_idx, x_row in df_x.iterrows():
    # Unfilled cells (zeros) never disqualify a candidate; computed once per row.
    blank = x_row == 0
    match = False
    for y_idx in unmatched_y:
        if ((x_row == df_y.loc[y_idx]) | blank).all():
            match = True
            break
    if match:
        # Removing here is safe: the loop over unmatched_y has already exited.
        unmatched_y.remove(y_idx)
        # list.append takes exactly one argument, so the pair goes in as a tuple.
        x2y.append((x_idx, y_idx))
    else:
        unmatched_x.append(x_idx)

如果你认为其中大多数行已经按位置一一匹配，可以通过运行下面这行代码找出这些已匹配的行：

# A row matches when each cell of df_x is either unfilled (0) or equal to
# the cell at the same position in df_y.
matches = (
    (df_x == 0) | (df_x == df_y)
).all(axis="columns")

这与上面的检查相同，但一次性作用于整个数据帧。它返回一个布尔 Series，表示 df_x 的每一行是否与 df_y 中相同位置的行匹配。然后你只需要对那些不匹配的行进行处理。
df_x[matches] 只包含匹配的行，而 df_x[~matches] 则是不匹配的行。

相关问题更多 >

编程相关推荐

热门问题

热门文章