Skip to content

Commit cb792b6

Browse files
perf(client): optimize file structure copying in multipart requests
1 parent 74af533 commit cb792b6

File tree

5 files changed

+151
-78
lines changed

5 files changed

+151
-78
lines changed

src/gitpod/_files.py

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
import io
44
import os
55
import pathlib
6-
from typing import overload
7-
from typing_extensions import TypeGuard
6+
from typing import Sequence, cast, overload
7+
from typing_extensions import TypeVar, TypeGuard
88

99
import anyio
1010

@@ -17,7 +17,9 @@
1717
HttpxFileContent,
1818
HttpxRequestFiles,
1919
)
20-
from ._utils import is_tuple_t, is_mapping_t, is_sequence_t
20+
from ._utils import is_list, is_mapping, is_tuple_t, is_mapping_t, is_sequence_t
21+
22+
_T = TypeVar("_T")
2123

2224

2325
def is_base64_file_input(obj: object) -> TypeGuard[Base64FileInput]:
@@ -121,3 +123,51 @@ async def async_read_file_content(file: FileContent) -> HttpxFileContent:
121123
return await anyio.Path(file).read_bytes()
122124

123125
return file
126+
127+
128+
def deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]]) -> _T:
129+
"""Copy only the containers along the given paths.
130+
131+
Used to guard against mutation by extract_files without copying the entire structure.
132+
Only dicts and lists that lie on a path are copied; everything else
133+
is returned by reference.
134+
135+
For example, given paths=[["foo", "files", "file"]] and the structure:
136+
{
137+
"foo": {
138+
"bar": {"baz": {}},
139+
"files": {"file": <content>}
140+
}
141+
}
142+
The root dict, "foo", and "files" are copied (they lie on the path).
143+
"bar" and "baz" are returned by reference (off the path).
144+
"""
145+
return _deepcopy_with_paths(item, paths, 0)
146+
147+
148+
def _deepcopy_with_paths(item: _T, paths: Sequence[Sequence[str]], index: int) -> _T:
149+
if not paths:
150+
return item
151+
if is_mapping(item):
152+
key_to_paths: dict[str, list[Sequence[str]]] = {}
153+
for path in paths:
154+
if index < len(path):
155+
key_to_paths.setdefault(path[index], []).append(path)
156+
157+
# if no path continues through this mapping, it won't be mutated and copying it is redundant
158+
if not key_to_paths:
159+
return item
160+
161+
result = dict(item)
162+
for key, subpaths in key_to_paths.items():
163+
if key in result:
164+
result[key] = _deepcopy_with_paths(result[key], subpaths, index + 1)
165+
return cast(_T, result)
166+
if is_list(item):
167+
array_paths = [path for path in paths if index < len(path) and path[index] == "<array>"]
168+
169+
# if no path expects a list here, nothing will be mutated inside it - return by reference
170+
if not array_paths:
171+
return cast(_T, item)
172+
return cast(_T, [_deepcopy_with_paths(entry, array_paths, index + 1) for entry in item])
173+
return item

src/gitpod/_utils/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
coerce_integer as coerce_integer,
2525
file_from_path as file_from_path,
2626
strip_not_given as strip_not_given,
27-
deepcopy_minimal as deepcopy_minimal,
2827
get_async_library as get_async_library,
2928
maybe_coerce_float as maybe_coerce_float,
3029
get_required_header as get_required_header,

src/gitpod/_utils/_utils.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -177,21 +177,6 @@ def is_iterable(obj: object) -> TypeGuard[Iterable[object]]:
177177
return isinstance(obj, Iterable)
178178

179179

180-
def deepcopy_minimal(item: _T) -> _T:
181-
"""Minimal reimplementation of copy.deepcopy() that will only copy certain object types:
182-
183-
- mappings, e.g. `dict`
184-
- list
185-
186-
This is done for performance reasons.
187-
"""
188-
if is_mapping(item):
189-
return cast(_T, {k: deepcopy_minimal(v) for k, v in item.items()})
190-
if is_list(item):
191-
return cast(_T, [deepcopy_minimal(entry) for entry in item])
192-
return item
193-
194-
195180
# copied from https://github.com/Rapptz/RoboDanny
196181
def human_join(seq: Sequence[str], *, delim: str = ", ", final: str = "or") -> str:
197182
size = len(seq)

tests/test_deepcopy.py

Lines changed: 0 additions & 58 deletions
This file was deleted.

tests/test_files.py

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44
import pytest
55
from dirty_equals import IsDict, IsList, IsBytes, IsTuple
66

7-
from gitpod._files import to_httpx_files, async_to_httpx_files
7+
from gitpod._files import to_httpx_files, deepcopy_with_paths, async_to_httpx_files
8+
from gitpod._utils import extract_files
89

910
readme_path = Path(__file__).parent.parent.joinpath("README.md")
1011

@@ -49,3 +50,99 @@ def test_string_not_allowed() -> None:
4950
"file": "foo", # type: ignore
5051
}
5152
)
53+
54+
55+
def assert_different_identities(obj1: object, obj2: object) -> None:
56+
assert obj1 == obj2
57+
assert obj1 is not obj2
58+
59+
60+
class TestDeepcopyWithPaths:
61+
def test_copies_top_level_dict(self) -> None:
62+
original = {"file": b"data", "other": "value"}
63+
result = deepcopy_with_paths(original, [["file"]])
64+
assert_different_identities(result, original)
65+
66+
def test_file_value_is_same_reference(self) -> None:
67+
file_bytes = b"contents"
68+
original = {"file": file_bytes}
69+
result = deepcopy_with_paths(original, [["file"]])
70+
assert_different_identities(result, original)
71+
assert result["file"] is file_bytes
72+
73+
def test_list_popped_wholesale(self) -> None:
74+
files = [b"f1", b"f2"]
75+
original = {"files": files, "title": "t"}
76+
result = deepcopy_with_paths(original, [["files", "<array>"]])
77+
assert_different_identities(result, original)
78+
result_files = result["files"]
79+
assert isinstance(result_files, list)
80+
assert_different_identities(result_files, files)
81+
82+
def test_nested_array_path_copies_list_and_elements(self) -> None:
83+
elem1 = {"file": b"f1", "extra": 1}
84+
elem2 = {"file": b"f2", "extra": 2}
85+
original = {"items": [elem1, elem2]}
86+
result = deepcopy_with_paths(original, [["items", "<array>", "file"]])
87+
assert_different_identities(result, original)
88+
result_items = result["items"]
89+
assert isinstance(result_items, list)
90+
assert_different_identities(result_items, original["items"])
91+
assert_different_identities(result_items[0], elem1)
92+
assert_different_identities(result_items[1], elem2)
93+
94+
def test_empty_paths_returns_same_object(self) -> None:
95+
original = {"foo": "bar"}
96+
result = deepcopy_with_paths(original, [])
97+
assert result is original
98+
99+
def test_multiple_paths(self) -> None:
100+
f1 = b"file1"
101+
f2 = b"file2"
102+
original = {"a": f1, "b": f2, "c": "unchanged"}
103+
result = deepcopy_with_paths(original, [["a"], ["b"]])
104+
assert_different_identities(result, original)
105+
assert result["a"] is f1
106+
assert result["b"] is f2
107+
assert result["c"] is original["c"]
108+
109+
def test_extract_files_does_not_mutate_original_top_level(self) -> None:
110+
file_bytes = b"contents"
111+
original = {"file": file_bytes, "other": "value"}
112+
113+
copied = deepcopy_with_paths(original, [["file"]])
114+
extracted = extract_files(copied, paths=[["file"]])
115+
116+
assert extracted == [("file", file_bytes)]
117+
assert original == {"file": file_bytes, "other": "value"}
118+
assert copied == {"other": "value"}
119+
120+
def test_extract_files_does_not_mutate_original_nested_array_path(self) -> None:
121+
file1 = b"f1"
122+
file2 = b"f2"
123+
original = {
124+
"items": [
125+
{"file": file1, "extra": 1},
126+
{"file": file2, "extra": 2},
127+
],
128+
"title": "example",
129+
}
130+
131+
copied = deepcopy_with_paths(original, [["items", "<array>", "file"]])
132+
extracted = extract_files(copied, paths=[["items", "<array>", "file"]])
133+
134+
assert extracted == [("items[][file]", file1), ("items[][file]", file2)]
135+
assert original == {
136+
"items": [
137+
{"file": file1, "extra": 1},
138+
{"file": file2, "extra": 2},
139+
],
140+
"title": "example",
141+
}
142+
assert copied == {
143+
"items": [
144+
{"extra": 1},
145+
{"extra": 2},
146+
],
147+
"title": "example",
148+
}

0 commit comments

Comments
 (0)