-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path__init__.py
140 lines (128 loc) · 4.62 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import pandas as pd
import numpy as np
import numexpr
def find_rectangle_intersections(
    df: pd.DataFrame,
    columns: tuple | list = ("start_x", "start_y", "end_x", "end_y"),
    new_column: str | int | float = "aa_intersecting",
    dtype: np.float32 | np.float64 | np.int32 | np.int64 = np.int32,
    convert_to_tuples: bool = False,
) -> pd.DataFrame:
    """
    Find, for every rectangle in a DataFrame, the rectangles that intersect it.

    The coordinate columns are cast to ``dtype`` and compared with NumExpr.
    Two rectangles are treated as separated only when one of them ends
    strictly before the other starts on the x or the y axis; everything else
    counts as an intersection. Consequently rectangles that merely touch are
    reported as intersecting, and every rectangle is listed in its own result.

    One boolean mask of length ``len(df)`` is precomputed per distinct
    coordinate value and per side, so memory use grows with the number of
    distinct coordinates times the number of rows.

    Args:
        df (pd.DataFrame): Input DataFrame containing rectangle coordinates.
            It is modified in place (``new_column`` is added or overwritten).
        columns (tuple or list, optional): Column names; the first four must
            be (start_x, start_y, end_x, end_y), in that order. All listed
            columns are cast to ``dtype`` and appear in the stored results.
            Defaults to ("start_x", "start_y", "end_x", "end_y").
        new_column (str, int, float, optional): Name of the column that
            receives the results. Defaults to "aa_intersecting".
        dtype (np.float32 | np.float64 | np.int32 | np.int64, optional):
            Data type the coordinates are cast to before comparing.
            Defaults to np.int32.
        convert_to_tuples (bool, optional): If True, each result is a tuple
            of row tuples; otherwise it is a 2-D NumPy array with one row per
            intersecting rectangle. Defaults to False.

    Returns:
        pd.DataFrame: The same ``df`` object, with ``new_column`` populated.

    Example:
        import time
        import pandas as pd
        import numpy as np

        min_x = 1
        max_x = 100
        min_y = 1
        max_y = 100
        size = 50000
        min_width = 1
        max_width = 1000
        min_height = 1
        max_height = 1000
        df = pd.DataFrame(
            [
                (startx := np.random.randint(min_x, max_x, size=size)),
                (starty := np.random.randint(min_y, max_y, size=size)),
                startx + np.random.randint(min_width, max_width, size=size),
                starty + np.random.randint(min_height, max_height, size=size),
            ]
        ).T.rename(columns={0: "start_x", 1: "start_y", 2: "end_x", 3: "end_y"})
        start = time.perf_counter()
        df = find_rectangle_intersections(
            df,
            columns=("start_x", "start_y", "end_x", "end_y"),
            new_column="aa_intersecting",
            dtype=np.int32,
            convert_to_tuples=False,
        )
        print(time.perf_counter() - start)
    """

    def masks_per_bound(own, other, expression):
        # Build one "separated" mask per distinct value of ``own`` and return,
        # for every row, the position of its mask in that list.
        # The bound is handed to NumExpr as a typed variable rather than being
        # formatted into the expression text: this keeps the comparison in
        # ``dtype`` (a float32 bound printed as text would be re-read as a
        # double) and keeps the expression string constant across bounds.
        bounds, row_to_mask = np.unique(own, return_inverse=True)
        masks = [
            numexpr.evaluate(
                expression,
                global_dict={},
                local_dict={"other": other, "bound": bound},
            )
            for bound in bounds
        ]
        return masks, row_to_mask

    coords = df[[*columns]].astype(dtype).to_numpy()
    if convert_to_tuples:
        candidates = np.fromiter(map(tuple, coords), dtype="object")
    else:
        candidates = coords

    start_x = np.ascontiguousarray(coords[:, 0])
    start_y = np.ascontiguousarray(coords[:, 1])
    end_x = np.ascontiguousarray(coords[:, 2])
    end_y = np.ascontiguousarray(coords[:, 3])

    # Masks and row indices are both derived from the *cast* coordinates, so a
    # cast that changes a value (e.g. float -> int) cannot cause a failed
    # lookup later on.
    ends_before_x, ix_start_x = masks_per_bound(start_x, end_x, "other < bound")
    starts_after_x, ix_end_x = masks_per_bound(end_x, start_x, "other > bound")
    starts_after_y, ix_end_y = masks_per_bound(end_y, start_y, "other > bound")
    ends_before_y, ix_start_y = masks_per_bound(start_y, end_y, "other < bound")

    n_rows = len(coords)
    separated = np.empty(n_rows, dtype=bool)  # scratch buffer reused per row
    found = np.empty(n_rows, dtype=object)
    row_indices = zip(ix_start_x, ix_end_x, ix_end_y, ix_start_y)
    for row, (i_sx, i_ex, i_ey, i_sy) in enumerate(row_indices):
        numexpr.evaluate(
            "y1 | y2 | y3 | y4",
            global_dict={},
            local_dict={
                "y1": ends_before_x[i_sx],
                "y2": starts_after_x[i_ex],
                "y3": starts_after_y[i_ey],
                "y4": ends_before_y[i_sy],
            },
            out=separated,
            casting="no",
        )
        # ``separated`` marks rectangles lying entirely to one side of the
        # current one, so the intersecting rectangles are its complement.
        hits = candidates[~separated]
        found[row] = tuple(hits) if convert_to_tuples else hits

    df[new_column] = pd.Series(found, index=df.index)
    return df