<p>此代码在时间戳不一定在同一invervals中计算的情况下计算Jaccard相似性<code>O(len(s1)^2 + len(s2)^2)</code>时间复杂度</p>
<pre><code>unix_converted = [(1, 3), (6, 10), (11, 12)]
input_timestamps = [(1, 3), (4, 7)]
def jaccard_index(s1, s2):
def _set_sum(start1, end1, start2, end2):
""" returns sum if there is an overlap and None otherwise """
if start2 <= start1 <= end2:
return start2, max(end1, end2)
if start1 <= start2 <= end1:
return start1, max(end1, end2)
return None # separate sets
def _set_intersection(start1, end1, start2, end2):
""" returns intersection if there is an overlap and None otherwise """
if start2 <= start1 <= end2:
return start1, min(end1, end2)
if start1 <= start2 <= end1:
return start2, min(end1, end2)
return None # separate sets
# Calculate A u B
sum = []
for x, y in s1 + s2:
matched_elem = False
for i, (x2, y2) in enumerate(sum):
set_sum = _set_sum(x, y, x2, y2)
if set_sum is not None:
sum[i] = set_sum
matched_elem = True
break
if not matched_elem:
sum.append((x, y))
# join overlapping timestamps
element_is_joined = [False for _ in sum]
for i, (x, y) in enumerate(sum):
if not element_is_joined[i]:
for j, (x2, y2) in enumerate(sum):
if element_is_joined[j] or i == j:
continue
set_sum = _set_sum(x, y, x2, y2)
if set_sum is not None: # overlap is found
sum[j] = set_sum
element_is_joined[i] = True
break
sum_ = 0
for (x, y), is_joined in zip(sum, element_is_joined):
if not is_joined:
sum_ += y - x
if sum_ == 0:
raise ValueError('Division by zero')
# calculate A ^ B
intersection = 0
for x, y in s1:
for x2, y2 in s2:
set_intersection = _set_intersection(x, y, x2, y2)
if set_intersection is not None:
intersection += set_intersection[1] - set_intersection[0]
return intersection / sum_
print(jaccard_index(unix_converted, input_timestamps)) #outputs 0.333333
</code></pre>