Jaccard Index between timestamps in Python

Jaccard Index between timestamps in Python - python

I have UNIX timestamps that have been converted to time strings, plus a second set of time strings given as input, and I need to compute the Jaccard index between the two. Both are stored in 2D arrays as time intervals:
unix_converted = [['00:00:00', '00:00:03'], ['00:00:03', '00:00:06'], ['00:00:12', '00:00:15']]
input_timestamps = [['00:00:00', '00:00:03'], ['00:00:03', '00:00:06'], ['00:00:06', '00:00:09']]
def jaccard_index(s1, s2):
    """Placeholder from the question: not implemented yet, always raises."""
    raise NotImplementedError
Do I have to convert these intervals to datetime objects, or is there a more straightforward way? And how do I compute the index itself?

You could exploit Python's native support for sets to calculate your Jaccard Index.
unix_converted = [['00:00:00', '00:00:03'], ['00:00:03', '00:00:06'], ['00:00:12', '00:00:15']]
input_timestamps = [['00:00:00', '00:00:03'], ['00:00:03', '00:00:06'], ['00:00:06', '00:00:09']]
def jaccard_index(s1, s2):
    """Return the Jaccard similarity of two collections of intervals.

    Each interval is a pair such as ``['00:00:00', '00:00:03']``. Two
    intervals count as the same element only when all of their components
    are equal; partial overlap between intervals is not considered.

    Args:
        s1: iterable of intervals (each an iterable of hashable values).
        s2: iterable of intervals (each an iterable of hashable values).

    Returns:
        ``len(A & B) / len(A | B)`` as a float. Two empty inputs are treated
        as identical and give ``1.0`` instead of dividing by zero.
    """
    # Tuples are hashable and, unlike '-'.join(...), cannot make two
    # different pairs collide (e.g. ['a-b', 'c'] versus ['a', 'b-c']).
    first = {tuple(interval) for interval in s1}
    second = {tuple(interval) for interval in s2}
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / len(union)
print(jaccard_index(unix_converted, input_timestamps)) #outputs 0.5
Edit: I'm assuming by Jaccard Index you meant Jaccard similarity i.e. intersection over union of the given lists.

This code calculates the Jaccard similarity in situations where the timestamps are not necessarily split into the same intervals. O(len(s1)^2 + len(s2)^2) time complexity.
unix_converted = [(1, 3), (6, 10), (11, 12)]
input_timestamps = [(1, 3), (4, 7)]
def _merge_intervals(intervals):
    """Return sorted, pairwise-disjoint intervals covering the same points."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous interval: extend it.
            last_start, last_end = merged[-1]
            merged[-1] = (last_start, max(last_end, end))
        else:
            merged.append((start, end))
    return merged


def _total_length(intervals):
    """Return the summed length of a list of disjoint intervals."""
    return sum(end - start for start, end in intervals)


def jaccard_index(s1, s2):
    """Return the Jaccard similarity of two sets of numeric intervals.

    The similarity is the total length covered by both inputs divided by the
    total length covered by either input. Intervals inside one input may
    overlap each other; each input is merged first so that no stretch of
    time is counted twice.

    Args:
        s1: iterable of ``(start, end)`` pairs.
        s2: iterable of ``(start, end)`` pairs.

    Returns:
        The ratio ``|A intersect B| / |A union B|``.

    Raises:
        ValueError: if the union has zero length (e.g. both inputs empty).
    """
    first = _merge_intervals(s1)
    second = _merge_intervals(s2)

    # Calculate A u B
    union = _total_length(_merge_intervals(first + second))
    if union == 0:
        raise ValueError('Division by zero')

    # Calculate A ^ B with a two-pointer sweep over the two sorted,
    # disjoint lists; always advance the interval that ends first.
    intersection = 0
    i = j = 0
    while i < len(first) and j < len(second):
        start = max(first[i][0], second[j][0])
        end = min(first[i][1], second[j][1])
        if start < end:
            intersection += end - start
        if first[i][1] < second[j][1]:
            i += 1
        else:
            j += 1
    return intersection / union
print(jaccard_index(unix_converted, input_timestamps)) #outputs 0.333333

Related

How to generate random vertices that form tetrahedrons?

I am trying to develop a random tetrahedron generator that can take a list of four coordinates and produce a tetrahedron. Currently, I am only able to plot a tetrahedron using four pre-determined points. Here is the code I have as of now:
def rand_tetrahedron_generator(bounds, min_len, max_len):
    """
    Build a boolean voxel mask for a randomly generated tetrahedron.

    bounds: List - max length in each dimension
    min_len: int - minimum length of tetrahedron
    max_len: int - maximum length of tetrahedron

    Returns the element-wise AND of four half-space conditions, one per face,
    evaluated on the voxel midpoints of a fixed 60x60x60 index grid.
    """
    assert len(bounds) == 3
    assert min_len <= max_len
    # NOTE(review): max_len is clamped here but never read again below.
    max_len = min(max_len, bounds[0], bounds[1], bounds[2])
    bounds = np.array(bounds)
    # Four random integer vertices. p3 and p4 are drawn below p1 + p2, so
    # nothing here guarantees the four points are non-coplanar or distinct.
    p1 = np.random.randint(low=0, high=bounds, size=3)
    p2 = np.random.randint(low=0, high=bounds - min_len, size=3)
    p3 = np.random.randint(low=0, high=p1+p2, size=3)
    p4 = np.random.randint(low=0, high=p1+p2, size=3)
    points = np.array([p1,p2,p3,p4])
    # NOTE(review): `center` is local to this function, while
    # surface_normal_form reads a name `center` from its own scope - confirm
    # how the centroid is meant to reach it.
    center = np.mean(points, axis=0)
    # Shifted and scaled index grid; midpoints() turns corners into centres.
    x, y, z = (np.indices((60, 60, 60))-np.array([20,25,25]).reshape(-1,1,1,1))/8
    mx = midpoints(x)
    my = midpoints(y)
    mz = midpoints(z)
    conditions = []
    # One face per 3-vertex combination (4 faces); this rebinds p1, p2, p3.
    for p1,p2,p3 in itertools.combinations(points, 3):
        a, n = surface_normal_form(p1,p2,p3)
        # Keep voxels on the non-positive side of the face plane.
        conditions.append((mx-a[0])*n[0]+(my-a[1])*n[1]+(mz-a[2])*n[2] <= 0)
    simplex = conditions[0] & conditions[1] & conditions[2] & conditions[3]
    return simplex
def surface_normal_form(a, b, c, interior=None):
    """Return the plane through a, b, c as ``(point, outward normal)``.

    Args:
        a, b, c: 3-component NumPy arrays, the vertices of one face.
        interior: optional point known to lie on the inner side of the face.
            When omitted, the module-level name ``center`` is used, as in
            the original code.

    Returns:
        ``(a, n)`` where ``n`` is a normal of the plane, flipped if needed so
        that it points away from ``interior``.
    """
    v = b - a
    w = c - b
    n = np.cross(v, w)
    reference = center if interior is None else interior
    # normal needs to point out: flip it when it faces the interior point
    if (reference - a) @ n > 0:
        n = -n
    return a, n
def midpoints(x):
    """Average neighbouring entries along every axis of ``x`` in turn.

    Each axis ends up one element shorter; the result holds the value at the
    centre of every cell spanned by adjacent entries of the input.
    """
    leading = ()
    for _ in range(x.ndim):
        lower = x[leading + (slice(None, -1),)]
        upper = x[leading + (slice(1, None),)]
        x = (lower + upper) / 2.0
        # Step over the axis just processed on the next pass.
        leading = leading + (slice(None),)
    return x
I believe that the way I generate p1, p2, p3, and p4 is incorrect because the function sometimes generates points that are unable to form a tetrahedron. I would greatly appreciate any advice on how to solve this issue. I have also attached an image of the final result I am looking for.

Split intervals longer than a threshold

I have a list of tuples, each defining an interval (start, end).
I would like to split the intervals which are longer than a certain threshold.
Example:
Initial list: segs = [(0,100),(120,140),(160,200)]
Threshold: 30
Desired output:
split_segs = [(0,30),(30,60),(60,90),(90,100),(120,140),(160,190),(190,200)]
I come up with this code.
thr = 30.
split_segs = []
for a, b in segs:
    # Peel off full-length pieces while more than one threshold remains,
    # then keep whatever is left (at most `thr` long) as the final piece.
    # This keeps every full piece and never emits an empty (b, b) interval
    # when the length is an exact multiple of the threshold.
    start = a
    while b - start > thr:
        split_segs.append((start, start + thr))
        start += thr
    split_segs.append((start, b))
It works but looks very clumsy to me. Any better or more pythonic solution?

You can do this slightly more elegantly by extending with a range that has a step of threshold:
segs = [(0, 100), (120, 140), (160, 200)]
threshold = 30
split_segs = []
for seg in segs:
    a, b = seg
    diff = b - a
    if diff > threshold:
        # Full-length pieces, identified by their upper bound.
        for upper in range(a + threshold, b + 1, threshold):
            split_segs.append((upper - threshold, upper))
        leftover = diff % threshold
        if leftover:
            # complete the gap between the last full piece and b
            split_segs.append((b - leftover, b))
    else:
        # Short enough already: keep the interval untouched.
        split_segs.append(seg)
print(split_segs)

This is a recursive solution for your problem:
segs = [(0, 100), (120, 140), (160, 200)]
threshold = 30


def divide(to_divide):
    """Recursively split ``to_divide`` into pieces no longer than ``threshold``."""
    start = to_divide[0]
    end = to_divide[1]
    # Base case: nothing to split.
    if end - start <= threshold:
        return [to_divide]
    cut = start + threshold
    # One full-length piece, followed by the split of the remainder.
    return [(start, cut)] + divide((cut, end))


divided = [piece for interval in segs for piece in divide(interval)]
print(divided)
The output will be:
[(0, 30), (30, 60), (60, 90), (90, 100), (120, 140), (160, 190), (190, 200)]
UPDATE: if you prefer a non-recursive solution, here is one possibility:
segs = [(0, 100), (120, 140), (160, 200)]
threshold = 30


def divide(to_divide):
    """Split ``to_divide`` into pieces no longer than ``threshold`` (iterative)."""
    start, end = to_divide[0], to_divide[1]
    full_pieces = (end - start) // threshold
    pieces = [
        (start + k * threshold, start + (k + 1) * threshold)
        for k in range(full_pieces)
    ]
    if not pieces:
        # Shorter than one threshold: return the interval as a single piece.
        return [(start, end)]
    covered_up_to = pieces[-1][1]
    if covered_up_to != end:
        # Remainder after the last full-length piece.
        pieces.append((covered_up_to, end))
    return pieces


divided = [piece for interval in segs for piece in divide(interval)]
print(divided)

Check if a set of points described a triangle

I tried to solve this question but couldn't find a simple solution that avoids going through all the rows to find which numbers lie on the same line.
Is there a simple way to find triangles?
this is my solution for finding a triangle:
How can I change it to be more "pythonic"? (or even better method for solving it)
from sympy.solvers import solve
from sympy import Symbol
from collections import Counter
# Script: for each value, find its row ("line") in the number triangle and
# its distance from both edges of that row, then look for a value whose two
# distances match those of the extremes of the most common row.
vals = [8,17,19] # the triangle
dicl = [] # list of dicts, one per value
for v in vals:
    dic = {}
    dic['val'] = v
    v1 = v
    done = 0
    stepsb = 0
    while done == 0: # going backward until reaching the big triangle's edge
        x = Symbol('x')
        # Positive root of (x**2 + x)/2 + 1 == v1; it is an integer exactly
        # when v1 is the first number of a row.
        k = solve((x**2 + x)/2 +1 - v1, x)
        k = list(filter(lambda x:x>0, k))
        # NOTE(review): k is empty when no root is > 0 (presumably v1 == 1,
        # where the roots are 0 and -1), so k[0] would raise IndexError.
        if k[0]%1 == 0:
            done = 1
        else:
            v1 -= 1
            stepsb += 1
    dic['line'] = k[0]
    dic['stepsb'] = stepsb # distance from the left edge
    dic['stepsf'] = (k[0]**2 + 3*k[0] + 2)/2 - v # distance from the right edge
    dicl.append(dic)
    print(dic)
lines = [l['line'] for l in dicl]
mc = Counter(lines).most_common(1)[0][0] # the row shared by the most values
minv = min([l['val'] for l in dicl if l['line'] == mc])
maxv = max([l['val'] for l in dicl if l['line'] == mc])
# Left distance of the smallest and right distance of the largest value on
# that row.
stb = [l['stepsb'] for l in dicl if l['val'] == minv][0]
stf = [l['stepsf'] for l in dicl if l['val'] == maxv][0]
# Report success if any value has both of those distances.
for k in dicl:
    if k['stepsb'] == stb and k['stepsf'] == stf:
        print("good")
        break

A first step could be to search for a formula that translates the one-dimensional point number t to an x,y coordinate.
So, search for the largest n such that n*(n+1)/2 < t:
from sympy import solve, Eq
from sympy.abc import n, t
# n*(n+1)/2 == t, written without the division, solved symbolically for n.
f = Eq(n * (n + 1), 2 * t)
print(solve(f, n))
This shows as positive root: (sqrt(8*t + 1) - 1)/2.
To be strictly smaller, a formula that copes with small approximation errors could be:
floor((sqrt(8*t + 1) - 1)/2 - 0.0000001)
The following idea is, given a list of indices:
convert them to xy coordinates
find their center (sum and divide by the length of the list)
find the distances of each xy to the center
check that all distances are equal
To convert to an xy position, note that the height of an equilateral triangle with base 1 is sqrt(3)/2, so the distances between the y-positions should be multiplied by that factor. The x-positions need to be centered which can be achieved by subtracting n/2.
import math
def find_xy(t):
    """Convert a 1-based point number in the number triangle to plane coordinates.

    Args:
        t: integer point number, ``t >= 1``.

    Returns:
        A pair of floats: the row index scaled by ``sqrt(3) / 2`` (the height
        of a unit equilateral triangle), and the position within the row
        shifted by ``n / 2`` so that rows are centred.

    Raises:
        ValueError: if ``t < 1`` (raised by ``math.isqrt``).
    """
    # Largest n with n*(n+1)/2 < t, i.e. n*(n+1)/2 <= t - 1. Solving with an
    # exact integer square root avoids the float sqrt plus epsilon hack,
    # which returns the wrong row once t is large enough for rounding to
    # swallow the epsilon.
    n = (math.isqrt(8 * t - 7) - 1) // 2
    return (n + 1) * math.sqrt(3) / 2, t - n * (n + 1) // 2 - n / 2
def sq_dist(p, q):
    """Return the squared Euclidean distance between 2-D points p and q."""
    dx = p[0] - q[0]
    dy = p[1] - q[1]
    return dx ** 2 + dy ** 2
def center(points):
    """Return the centroid ``(mean x, mean y)`` of a list of 2-D points."""
    count = len(points)
    total_x = sum([pt[0] for pt in points])
    total_y = sum([pt[1] for pt in points])
    return total_x / count, total_y / count
def is_regular(tri_points):
    """Return True when all given points are equally far from their centroid.

    The point numbers are mapped to plane coordinates with find_xy; squared
    distances to the centroid are compared with a small tolerance.
    """
    coords = [find_xy(t) for t in tri_points]
    middle = center(coords)
    spread = [sq_dist(middle, xy) for xy in coords]
    farthest = max(spread)
    nearest = min(spread)
    return farthest - nearest < 0.000001
Note that this code finds geometric figures for which all the points lie on a circle. This doesn't work for the parallelogram. The actual question also has some extra criteria: all edges should follow the grid lines, and all edges need to be equal in length.
Therefore, it is useful to have 3 coordinates for each point: the row, the column and the diagonal (the 3 directions of the grid).
The length in each direction, is just the maximum minus the minimum for that direction. These lengths are called d_r, d_c and d_d in the code below.
Checking for a valid triangle, the 3 lengths need to be equal. One way to check this, is to check that the minimum of the lengths is equal to the maximum.
For a valid parallelogram, two lengths need to be equal, and the third should be the double. Checking that the maximum length is twice the minimum length should cover this. But, because this can already be reached using 3 points, we should also check that for a given direction, there are exactly 2 points at the minimum and 2 at the maximum. Summing all points and comparing twice the sum of maximum and minimum should accomplish this.
For a valid hexagon, the 3 lengths should be equal. So, the same test as for the triangle: the minimum of the lengths equal to the maximum. And also the test on the sums is needed, as 4 points can already fulfil the length conditions.
import math
def find_row_col_diag(t):
    """Convert a 1-based point number into ``(row, col, diag)`` grid coordinates.

    Args:
        t: integer point number in the number triangle, ``t >= 1``.

    Returns:
        ``(row, col, row - col)``, where ``row`` and ``col`` are 1-based.

    Raises:
        ValueError: if ``t < 1`` (raised by ``math.isqrt``).
    """
    # Largest n with n*(n+1)/2 < t, i.e. n*(n+1)/2 <= t - 1. Solving with an
    # exact integer square root avoids the float sqrt plus epsilon hack,
    # which returns the wrong row once t is large enough for rounding to
    # swallow the epsilon.
    n = (math.isqrt(8 * t - 7) - 1) // 2
    row, col = n + 1, t - n * (n + 1) // 2
    return row, col, row - col
def check_valid_figure(tri_points):
    """Print whether the given point numbers form an accepted figure.

    Three points are tested as a triangle, four as a parallelogram and six
    as a hexagon; any other count is rejected. Nothing is returned.
    """
    points = [find_row_col_diag(t) for t in tri_points]
    count = len(points)
    # One list of coordinates per grid direction: rows, columns, diagonals.
    axes = [[pt[k] for pt in points] for k in range(3)]
    lows = [min(values) for values in axes]
    highs = [max(values) for values in axes]
    spans = [high - low for low, high in zip(lows, highs)]
    shortest = min(spans)
    longest = max(spans)

    if count == 3:
        is_ok = longest == shortest
    elif count == 4:
        # Exactly two points at each extreme in every direction.
        balanced = all(
            sum(values) == 2 * (low + high)
            for values, low, high in zip(axes, lows, highs)
        )
        is_ok = longest == 2 * shortest and balanced
    elif count == 6:
        three_levels = all(len(set(values)) == 3 for values in axes)
        is_ok = longest == shortest and three_levels
    else:
        is_ok = False

    print(" ".join([str(t) for t in tri_points]), end=" ")
    if not is_ok:
        print("are not the vertices of an acceptable figure")
        return
    figure_names = {3: "triangle", 4: "parallelogram", 6: "hexagon"}
    print("are the vertices of a", figure_names[count])
# Sample vertex lists (3, 4 and 6 points) fed to check_valid_figure below.
tri_point_lists = [[1, 2, 3],
                   [11, 13, 22, 24],
                   [11, 13, 29, 31],
                   [11, 13, 23, 25],
                   [26, 11, 13, 24],
                   [22, 23, 30],
                   [4, 5, 9, 13, 12, 7]]
# Print one verdict line per sample.
for lst in tri_point_lists:
    check_valid_figure(lst)
The last code can be further compressed using list comprehensions:
def check_valid_figure_bis(tri_points):
    """Return True when the point numbers form an accepted figure.

    Three points are tested as a triangle, four as a parallelogram and six
    as a hexagon; any other count gives False.
    """
    points = [find_row_col_diag(t) for t in tri_points]
    # Coordinates grouped per grid direction: rows, columns, diagonals.
    axes = [[pt[axis] for pt in points] for axis in range(3)]
    totals = [sum(values) for values in axes]
    lows = [min(values) for values in axes]
    highs = [max(values) for values in axes]
    spans = [high - low for low, high in zip(lows, highs)]
    count = len(points)

    if count == 3:
        return max(spans) == min(spans)
    if count == 4:
        # Exactly two points at each extreme in every direction.
        return max(spans) == 2 * min(spans) and all(
            [total == 2 * (low + high)
             for total, low, high in zip(totals, lows, highs)]
        )
    if count == 6:
        return max(spans) == min(spans) and all(
            [len(set(values)) == 3 for values in axes]
        )
    return False

Check if all points in a set lie on the same line

I'm given a list with coordinates of n points, let's say:
points = [(1, 2), (2, 3), (3, 4)]
And I need to check if all of them lie on the same line. I also decided to consider 3 cases to avoid dividing by zero when x1 == x2.
So here's my code in Python:
# leave only unique points first, so the count below reflects distinct points
points = list(set(points))
# 2 (or fewer) distinct points always lie on a line
if len(points) <= 2:
    print("yes")
else:
    (x1, y1), (x2, y2) = points[0], points[1]
    dx, dy = x2 - x1, y2 - y1
    # A point (x, y) is on the line through the first two points exactly when
    # the cross product of (dx, dy) and (x - x1, y - y1) is zero. This needs
    # no division, so vertical and horizontal lines are not special cases,
    # and it is exact for integer coordinates.
    if all(dx * (y - y1) == dy * (x - x1) for x, y in points[2:]):
        print("yes")
    else:
        print("no")
It seems that I have a mistake somewhere in the code but I don't really understand what it is.

Using the cross product of vectors eliminates the complexity of having to deal with special cases where division by zero might happen. Three points are collinear if the cross product of the two vectors defined by the three points is equal to zero:
import math
class Point:
    """A 2-D point; subtracting two points yields the Vector between them."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __sub__(self, other):
        # Vector pointing from `other` to this point.
        dx = self.x - other.x
        dy = self.y - other.y
        return Vector(dx, dy)
class Vector:
    """A 2-D vector supporting the scalar (z-component) cross product."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def cross(self, other):
        # Zero exactly when the two vectors are parallel.
        forward = self.x * other.y
        backward = self.y * other.x
        return forward - backward
def are_collinear(three_points, abs_tol=1e-9):
    """Return True when the three given points lie on one line.

    Args:
        three_points: exactly three points; ``p - q`` must yield an object
            with a ``cross`` method returning a number.
        abs_tol: absolute tolerance on the cross product.

    Returns:
        True if the cross product of ``b - a`` and ``c - a`` is within
        ``abs_tol`` of zero.
    """
    a, b, c = three_points
    # math.isclose against 0.0 needs abs_tol: with the default abs_tol=0.0
    # the relative test can only succeed when the value is exactly 0.
    return math.isclose((b - a).cross(c - a), 0.0, abs_tol=abs_tol)
# Demo: three points on one line.
points = [Point(1, 2), Point(2, 3), Point(3, 4)]
print(are_collinear(points))
# True
# Demo: the middle point moved off that line.
points = [Point(1, 2), Point(3, 3), Point(3, 4)]
print(are_collinear(points))
# False

From any point in the list (e.g. first one) if all other points have the same slope with that one, then they are on the same line.
def sameLine(points):
    """Return True when all points lie on a single straight line.

    Args:
        points: sequence of ``(x, y)`` pairs.

    Returns:
        True for an empty sequence, for a sequence whose points all coincide,
        and whenever every point lies on the line through the first point and
        the first point that differs from it.
    """
    if not points:
        return True
    x0, y0 = points[0]
    # Points that coincide with the first one do not constrain the line.
    others = [(x, y) for x, y in points if x != x0 or y != y0]
    if not others:
        return True
    dx = others[0][0] - x0
    dy = others[0][1] - y0
    # Equal slopes, tested by cross-multiplication: no division, so vertical
    # lines need no special case and integer inputs are compared exactly.
    return all(dx * (y - y0) == dy * (x - x0) for x, y in others)

Is there a better way to find points along a curve than Bresenham's line algorithm

I have a travel time map, I want to get the integer points along the shortest path from source to receiver.
My present solution is that I make a runge-kutta integration from the receiver location and get a series of float points. Then I sample every 5 or some number of points and assume it a straight line between in order to use the Bresenham's line algorithm. With this approach, I will get the integer points.
However, it's not enough fast. Because I need to calculate a lot of receivers' shortest path, the sum of time will be very large.
I used line_profiler to analyze where the time goes; it shows that most of the time is spent in the runge_kutta function and in get_velocity, which it calls.
The code is below:
def optimal_path_2d(gradx_interp,
                    grady_interp,
                    starting_point,
                    dx,
                    N=100):
    """
    Trace a path from starting_point by stepping against the normalized
    travel-time gradient, i.e. integrate x_t = - grad t / | grad t |.

    gradx_interp and grady_interp are called as f(y, x) and indexed with
    [0][0]; dx is the step length (the grid spacing); at most N points are
    recorded. Tracing stops early once a step moves the position by less
    than 0.9 * dx.

    Returns the pair (y values, x values) of the recorded points.
    """

    def unit_gradient(position):
        """ gradient direction at position, scaled to unit length """
        px, py = position
        grad = np.array([gradx_interp(py, px)[0][0],
                         grady_interp(py, px)[0][0]])
        return grad / np.linalg.norm(grad)

    def rk4_step(pos, ds):
        """ one fourth order Runge Kutta step of length ds, going downhill """
        k1 = ds * unit_gradient(pos)
        k2 = ds * unit_gradient(pos - k1 / 2.0)
        k3 = ds * unit_gradient(pos - k2 / 2.0)
        k4 = ds * unit_gradient(pos - k3)
        return pos - (k1 + 2 * k2 + 2 * k3 + k4) / 6.0

    path_x = []
    path_y = []
    # The starting point itself is not recorded; the first stored point is
    # one step away from it.
    current = rk4_step(starting_point, dx)
    for _ in range(N):
        path_x.append(current[0])
        path_y.append(current[1])
        current = rk4_step(current, dx)
        moved = ((current[0] - path_x[-1])**2 +
                 (current[1] - path_y[-1])**2)**0.5
        if moved < dx*0.9:
            break
    return path_y, path_x
def get_curve(x_curve, y_curve, num_interval):
    """Curve Algorithm based on Bresenham's Line Algorithm

    Samples every ``num_interval``-th point of the curve, joins consecutive
    samples with get_line, and always finishes on the last point of the
    curve.

    Produces a list of unique integer (x, y) tuples in no particular order;
    an empty curve gives an empty list.
    """
    num = len(x_curve)
    if num == 0:
        return []
    if num < num_interval:
        print("num_interval is too large.")
    ret_set = set()
    x0 = x_curve[0]
    y0 = y_curve[0]
    for i in range(num_interval, num, num_interval):
        x1 = x_curve[i]
        y1 = y_curve[i]
        ret_set.update(get_line((x0, y0), (x1, y1)))
        x0 = x1
        y0 = y1
    # Always close with the final sample. The earlier test
    # `num % num_interval != 0` skipped this exactly when the last sampled
    # index was num - num_interval, dropping the tail of the curve. When
    # (x0, y0) already is the last point this adds at most that one point.
    ret_set.update(get_line((x0, y0), (x_curve[-1], y_curve[-1])))
    return list(ret_set)
def get_line(start, end):
    """Modified version of Bresenham's Line Algorithm

    Coordinates are truncated with int(); the result is the list of grid
    points from start to end, both included, in that order.

    >>> get_line((0, 0), (3, 4))
    [(0, 0), (1, 1), (1, 2), (2, 3), (3, 4)]
    >>> get_line((3, 4), (0, 0))
    [(3, 4), (2, 3), (1, 2), (1, 1), (0, 0)]
    """
    xa, ya = (int(v) for v in start)
    xb, yb = (int(v) for v in end)

    # A steep line is walked along y instead of x: swap the axes now and
    # swap them back when emitting points.
    steep = abs(yb - ya) > abs(xb - xa)
    if steep:
        xa, ya, xb, yb = ya, xa, yb, xb

    # Always walk in increasing x; remember to restore the caller's order.
    backwards = xa > xb
    if backwards:
        xa, ya, xb, yb = xb, yb, xa, ya

    run = xb - xa
    rise = abs(yb - ya)
    step = 1 if ya < yb else -1
    err = int(run / 2.0)

    pixels = []
    minor = ya
    for major in range(xa, xb + 1):
        pixels.append((minor, major) if steep else (major, minor))
        err -= rise
        if err < 0:
            minor += step
            err += run

    return pixels[::-1] if backwards else pixels
# Driver: build a synthetic travel-time map, interpolate its gradient, trace
# the path from the receiver, then rasterize it.
nx = 100
ny = 100
num_interval = 5
dx = 1.0  # grid spacing: coordx and coordy below are unit-spaced
loc_src = (10, 10)
loc_rec = (70, 90)
coordx = np.arange(nx)
coordy = np.arange(ny)
X, Y = np.meshgrid(coordx, coordy)  # was `coords`, an undefined name
travel_time = (X-loc_src[0])**2/5 + (Y-loc_src[1])**2/10 # for simplicity
grad_t_y, grad_t_x = np.gradient(travel_time, dx)
if isinstance(travel_time, np.ma.MaskedArray):
    # Replace masked gradient entries by 0 and keep the plain arrays.
    grad_t_y[grad_t_y.mask] = 0.0
    grad_t_y = grad_t_y.data
    grad_t_x[grad_t_x.mask] = 0.0
    grad_t_x = grad_t_x.data
gradx_interp = RectBivariateSpline(coordy, coordx, grad_t_x)
grady_interp = RectBivariateSpline(coordy, coordx, grad_t_y)
# The function defined above is optimal_path_2d; `optimal_path` does not exist.
yl, xl = optimal_path_2d(gradx_interp, grady_interp, loc_rec, dx)
grid_indx = get_curve(xl, yl, num_interval)
I heard that Cython would be faster, so I recently learned a little and tried it. The result is only about 2 times faster than the code above, probably because I'm really new to Cython. The code below is incomplete; I just wrote it for testing.
import numpy as np
from numpy.core.umath_tests import inner1d
def func(X_interp, Y_interp):
    # Test harness: defines typed versions of the velocity lookup and the
    # Runge-Kutta step, then calls the step 50 times with fixed arguments.
    # X_interp and Y_interp are called as f(y, x) and indexed with [0][0].
    def get_velocity(double x, double y ):
        """ return velocity at (x, y); normalization is commented out below """
        cdef double vel[2], norm
        a = X_interp(y, x)
        vel[0] = a[0][0]
        b = Y_interp(y, x)
        vel[1] = b[0][0]
        # norm = (vel[0]**2 + vel[1]**2)**0.5
        # vel[0] = vel[0]/norm
        # vel[1] = vel[1]/norm
        return vel
    def runge_kutta(double x, double y, double ds):
        """ Fourth order Runge Kutta point update """
        cdef double k1[2], k2[2], k3[2], k4[2], r[2], pos[2]
        pos[0] = x; pos[1] = y
        k1 = get_velocity(pos[0], pos[1])
        k2 = get_velocity(pos[0] - k1[0]/2.0*ds,pos[1] - k1[1]/2.0*ds)
        k3 = get_velocity(pos[0] - k2[0]/2.0*ds,pos[1] - k2[1]/2.0*ds)
        # NOTE(review): k4 is evaluated at a half step (k3/2.0*ds), whereas
        # the pure-Python version uses the full step (pos - k3) - confirm
        # which is intended.
        k4 = get_velocity(pos[0] - k3[0]/2.0*ds,pos[1] - k3[1]/2.0*ds)
        cdef size_t i
        for i in range(2):
            r[i] = pos[i] - ds * (k1[i] + 2*k2[i] + 2*k3[i] + k4[i])/6.0
        return r
    # Timing loop only: the results are discarded.
    for i in range(50):
        runge_kutta(0, 0, 1.)
    # print(runge_kutta(0, 0, 1.))

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Jaccard Index between timestamps in Python - python

Related

How to generate random vertices that form tetrahedrons?

Split intervals longer than a threshold

Check if a set of points described a triangle

Check if all points in a set lie on the same line

Is there a better way to find points along a curve than Bresenham's line algorithm

Categories

Resources