Python中执行稀疏矩阵乘法的最快方法

2024-05-15 08:35:07 发布

您现在位置:Python中文网/ 问答频道 /正文

背景:我正在尝试建立一个实时鼓模拟模型,为此我需要非常快速的矩阵向量积。我的矩阵大小约为5000-10000行/列,其中每行只有6个条目是非零的,因此我倾向于使用稀疏矩阵。我正在使用scipy.sparse模块。迭代如下。你知道吗

Vjk_plus_sparse = Vjk_minus_sparse.transpose()
Vj = Vjk_plus_sparse.dot(constant)
np.put(Vj, Nr, 0.0)
Uj[t] = Uj[t-1] + np.transpose(Vj)/fs
Vj_mat = adj_mat_sparse.multiply(Vj)
Vjk_minus_sparse = Vj_mat-Vjk_plus_sparse.multiply(end_gain)

这里,Vjk_plus_sparseVjk_minus_sparseVj_mat是稀疏CSR矩阵,Vjnumpy数组,Ujnumpy矩阵,其中每行表示Uj(t)end_gain是一个静态numpy阵列,用于抑制振动。你知道吗

问题是:size = 4250一次迭代大约需要3毫秒。最重要的 步骤是最后两行。它们加在一起大约需要2.5毫秒。我需要它在0.1 ms中运行,这将是10倍以上的加速。这是问题可能的最大矢量化范围,我不能并行化,因为我在时间上前进,至少在物理上它不会是准确的。你知道吗

尝试:我尝试摆弄稀疏数据结构,发现它们都是CSR(Compressed sparse Row)的性能最好,其值如上所述。我还尝试用矩阵乘法替换multiply()方法,方法是重复Vj,但这会使时间变慢,因为结果操作将是稀疏*密集操作。你知道吗

如何在python本身中加快这个速度?我也愿意尝试c++,尽管现在迁移将是一个很大的痛苦。另外,既然scipy基本上是基于c语言的,它会不会有那么大的加速?你知道吗

添加了一个完整的可运行示例

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches
import math
from mpl_toolkits import mplot3d
import numpy as np
import scipy.sparse as sp
import scipy.fftpack as spf
import matplotlib.animation as animation
import time

sqrt_3 = 1.73205080757


class Pt:
    def __init__(self,x_0,y_0):
        self.x_0 = x_0
        self.y_0 = y_0
        self.id = -1
        self.neighbours = []
        self.distance = (x_0**2 + y_0**2)**0.5

class Circle:
    def __init__(self,radius,center):
        self.radius = radius
        self.center = center
        self.nodes = []

    def construct_mesh(self, unit):
        queue = [self.center]
        self.center.distance = 0
        curr_id = 0
        delta = [(1.,0.), (1./2, (3**0.5)/2),(-1./2, (3**0.5)/2),(-1.,0.), (-1./2,-(3**0.5)/2), (1./2,- (3**0.5)/2)]
        node_dict = {}
        node_dict[(self.center.x_0,self.center.y_0)] = curr_id
        self.nodes.append(self.center)
        curr_id+=1
        while len(queue)!=0:
            curr_pt = queue[0]
            queue.pop(0)
            # self.nodes.append(curr_pt)
            # curr_id+=1
            for i in delta:

                temp_pt = Pt(curr_pt.x_0 + 2*unit*i[0], curr_pt.y_0 + 2*unit*i[1])
                temp_pt.id = curr_id
                temp_pt.distance = (temp_pt.x_0 ** 2 + temp_pt.y_0 ** 2)**0.5          
                # curr_id+=1
                if (round(temp_pt.x_0,5), round(temp_pt.y_0,5)) not in node_dict and temp_pt.distance <= self.radius:
                    # print(temp_pt.x_0, temp_pt.y_0)
                    self.nodes.append(temp_pt)
                    node_dict[(round(temp_pt.x_0,5), round(temp_pt.y_0,5))] = curr_id
                    curr_id+=1
                    queue.append(temp_pt)
                    curr_pt.neighbours.append(temp_pt.id)

                elif temp_pt.distance <= self.radius:
                    curr_pt.neighbours.append(node_dict[round(temp_pt.x_0,5), round(temp_pt.y_0,5)])

        # print(node_dict)

    def plot_neighbours(self, pt):
        x = []
        y = []
        x.append(pt.x_0)
        y.append(pt.y_0)
        for i in (pt.neighbours):
            x.append(self.nodes[i].x_0)
            y.append(self.nodes[i].y_0)
        plt.scatter(x,y)
        plt.axis('scaled')

    def boundary_node_ids(self):
        boundary_nodes = []
        for j in range(len(self.nodes)):
            if(len(self.nodes[j].neighbours) < 6):
                boundary_nodes.append(j)
        return boundary_nodes

    def add_rim(self, boundary_node_ids, unit):
        c = self.center
        rim_ids = []
        N = len(self.nodes)
        for i in range(len(boundary_node_ids)):
            d = self.nodes[boundary_node_ids[i]].distance
            xp = self.nodes[boundary_node_ids[i]].x_0
            yp = self.nodes[boundary_node_ids[i]].y_0
            xnew = xp + xp*unit/d
            ynew = yp + yp*unit/d
            new_point = Pt(xnew, ynew)
            new_point.id = N + i
            rim_ids.append(N+i)
            self.nodes.append(new_point)
            self.nodes[boundary_node_ids[i]].neighbours.append(new_point.id)
            self.nodes[N+i].neighbours.append(boundary_node_ids[i])
        return rim_ids

def find_nearest_point(mesh, pt):
    distances_from_center = np.zeros(len(mesh.nodes))
    for i in xrange(len(mesh.nodes)):
        distances_from_center[i] = mesh.nodes[i].distance
    target_distance = pt.distance
    closest_point_id = np.argmin(np.abs(distances_from_center-target_distance))
    return closest_point_id

def init_impulse(mesh, impulse,  Vj, poi, roi):
    data = []   
    for i in range(len(Vj)):
        r = ((mesh.nodes[i].x_0 - mesh.nodes[poi].x_0)**2 + (mesh.nodes[i].y_0 - mesh.nodes[poi].y_0)**2)**0.5
        Vj[i] = max(0, impulse*(1. - (r/roi)))
        if i in Nr:
            Vj[i] = 0.
        for k in mesh.nodes[i].neighbours:
            data.append(np.asscalar(Vj[i])/2.)

    return Vj, data


r = 0.1016                                 #Radius of drum head
# rho = 2500                          #Density of drum head
thickness = 0.001                       #Thickness of membrane
# tension = 1500                     #Tension in membrane in N
param = 0.9
c = (param/thickness)**(0.5)    #Speed of wave in string
duration = 0.25
fs = 4000
delta = c/fs

center = Pt(0,0)
point_of_impact = Pt(r/2., 0)
center.id = 0
mesh = Circle(r,center)
mesh.construct_mesh(delta)
N = len(mesh.nodes)

Nb = []
for j in range(N):
    if len(mesh.nodes[j].neighbours) < 6:
        Nb.append(j)

Nr = mesh.add_rim(Nb, delta)

N = len(mesh.nodes)
print(N)

row_ind = []
col_ind = []

for j in range(N):
    for k in mesh.nodes[j].neighbours:
        row_ind.append(j)
        col_ind.append(k)

data = np.ones(len(col_ind))

adj_mat_sparse = sp.csr_matrix((data, (row_ind, col_ind)), shape = (N,N))

Vjk_plus = sp.csr_matrix([N, N])
Vj = np.zeros([N,1])
Uj = np.zeros([int(duration*fs), N])
Vj_mat = sp.csc_matrix([N,N])

closest_point_id = find_nearest_point(mesh, point_of_impact)
Vj, Vjk_data = init_impulse(mesh, -10.0, Vj, closest_point_id, r/10.)
Vjk_minus_sparse = sp.csr_matrix((Vjk_data, (row_ind, col_ind)), shape = (N,N))
constant = (1./3)*np.ones([N,1])


Vjk_plus = Vjk_minus_sparse.transpose()
np.put(Vj, Nr, 0.0)
Uj[1] = Uj[0] + np.transpose(Vj)/fs
Vj_mat = adj_mat_sparse.multiply(Vj)
Vjk_minus_sparse = Vj_mat - Vjk_plus

end_gain = np.ones([N,1])
end_gain[Nr] = 1.0          

for t in range(2,int(duration*fs)): 
    Vjk_plus = Vjk_minus_sparse.transpose()
    Vj = Vjk_plus.dot(constant)
    np.put(Vj, Nr, 0.0)
    Uj[t] = Uj[t-1] + np.transpose(Vj)/fs
    Vj_mat = adj_mat_sparse.multiply(Vj)
    Vjk_minus_sparse = Vj_mat-Vjk_plus.multiply(end_gain)

Tags: inselfptidnodenptempnodes