擅长:python、mysql、java
<p>我会像这样把它切成块</p>
<pre><code>from sklearn.metrics.pairwise import cosine_similarity
# Change chunk_size to control resource consumption and speed
# Higher chunk_size means more memory/RAM needed but also faster
chunk_size = 500
matrix_len = your_matrix.shape[0] # Not sparse numpy.ndarray
def similarity_cosine_by_chunk(start, end):
if end > matrix_len:
end = matrix_len
return cosine_similarity(X=your_matrix[start:end], Y=your_matrix) # scikit-learn function
for chunk_start in xrange(0, matrix_len, chunk_size):
cosine_similarity_chunk = similarity_cosine_by_chunk(chunk_start, chunk_start+chunk_size)
# Handle cosine_similarity_chunk ( Write it to file_timestamp and close the file )
# Do not open the same file again or you may end up with out of memory after few chunks
</code></pre>