<p>这是一个效率不高的变通办法,我把它留在这里,以防对其他人有帮助。欢迎提出其他建议。</p>
<pre><code>def calculate_cosine_distance():
unique_terms = get_unique_terms_as_list()
tfidf_matrix = [[0 for i in range(len(unique_terms))] for j in range(TOTAL_NUMBER_OF_BOOKS)]
with open(INPUT_FILE_PATH, mode='r') as infile:
reader = csv.reader(infile.read().splitlines(), quoting=csv.QUOTE_NONE)
# Ignore header row
next(reader)
for rows in reader:
book = int(rows[0]) - 1 # To make it a zero-indexed array
term_index = int(unique_terms.index(rows[1]))
tfidf_matrix[book][term_index] = rows[2]
# Calculate distance between book X and book Y
print cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)
def get_unique_terms_as_list():
    """Return the distinct values of the second CSV field as a sorted list.

    Reads the file at INPUT_FILE_PATH, skips the first (header) row and
    collects field 1 of every remaining row. Blank rows are ignored, and an
    empty or header-only file yields an empty list.
    """
    # Plain 'r' rather than 'rU': the 'U' flag was removed in Python 3.11,
    # and splitlines() below already copes with \n, \r\n and \r endings.
    with open(INPUT_FILE_PATH, mode='r') as infile:
        reader = csv.reader(infile.read().splitlines(), quoting=csv.QUOTE_NONE)
        # Skip header; the default keeps an empty file from raising StopIteration
        next(reader, None)
        # csv.reader yields an empty list for a blank line, so filter those out
        unique_terms = {row[1] for row in reader if row}
    # Sorted so the order of terms is reproducible from run to run
    return sorted(unique_terms)
</code></pre>