-
-
Notifications
You must be signed in to change notification settings - Fork 845
Implement chunk comparison and selective extraction for borg extract (#5638) #8632
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -719,6 +719,63 @@ def extract_helper(self, item, path, hlm, *, dry_run=False): | |
| # In this case, we *want* to extract twice, because there is no other way. | ||
| pass | ||
|
|
||
| def compare_and_extract_chunks(self, item, fs_path, *, st, pi=None): | ||
| """Compare file chunks and patch if needed. Returns True if patching succeeded.""" | ||
| if st is None: | ||
| return False | ||
| try: | ||
| # First pass: Build fs chunks list | ||
| fs_chunks = [] | ||
| with backup_io("open"): | ||
| fs_file = open(fs_path, "rb") | ||
| with fs_file: | ||
| for chunk in item.chunks: | ||
| with backup_io("read"): | ||
| data = fs_file.read(chunk.size) | ||
|
|
||
| fs_chunks.append(ChunkListEntry(id=self.key.id_hash(data), size=len(data))) | ||
|
|
||
| # Compare chunks and collect needed chunk IDs | ||
| needed_chunks = [] | ||
| for fs_chunk, item_chunk in zip(fs_chunks, item.chunks): | ||
| if fs_chunk != item_chunk: | ||
| needed_chunks.append(item_chunk) | ||
|
|
||
| # Fetch all needed chunks and iterate through ALL of them | ||
| chunk_data_iter = self.pipeline.fetch_many([chunk.id for chunk in needed_chunks], ro_type=ROBJ_FILE_STREAM) | ||
|
|
||
| # Second pass: Update file | ||
| with backup_io("open"): | ||
| fs_file = open(fs_path, "rb+") | ||
| with fs_file: | ||
| for fs_chunk, item_chunk in zip(fs_chunks, item.chunks): | ||
| if fs_chunk == item_chunk: | ||
| with backup_io("seek"): | ||
| fs_file.seek(item_chunk.size, 1) | ||
| else: | ||
| chunk_data = next(chunk_data_iter) | ||
|
|
||
| with backup_io("write"): | ||
| fs_file.write(chunk_data) | ||
| if pi: | ||
| pi.show(increase=len(chunk_data), info=[remove_surrogates(item.path)]) | ||
|
alighazi288 marked this conversation as resolved.
Outdated
|
||
|
|
||
| final_size = fs_file.tell() | ||
| with backup_io("truncate_and_attrs"): | ||
| fs_file.truncate(item.size) | ||
| fs_file.flush() | ||
| self.restore_attrs(fs_path, item, fd=fs_file.fileno()) | ||
|
alighazi288 marked this conversation as resolved.
Outdated
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You could create a method clear_attrs close to the definition of restore_attrs and move that code you added above to there. |
||
|
|
||
| if "size" in item and item.size != final_size: | ||
| raise BackupError(f"Size inconsistency detected: size {item.size}, chunks size {final_size}") | ||
|
|
||
| if "chunks_healthy" in item and not item.chunks_healthy: | ||
| raise BackupError("File has damaged (all-zero) chunks. Try running borg check --repair.") | ||
|
|
||
| return True | ||
| except OSError: | ||
| return False | ||
|
|
||
| def extract_item( | ||
| self, | ||
| item, | ||
|
|
@@ -802,12 +859,14 @@ def same_item(item, st): | |
| return # done! we already have fully extracted this file in a previous run. | ||
| elif stat.S_ISDIR(st.st_mode): | ||
| os.rmdir(path) | ||
| st = None | ||
| else: | ||
| os.unlink(path) | ||
| st = None | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Did you think about this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ThomasWaldmann I didn't realize that this would break the I have tried:
The only fix in my mind is the extra OS call to check the file's existence. Maybe I'm missing something?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If the fs file is a normal file, your code requires it to be there, so it can be "updated" - thus it must not be removed.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. And we also need to think about what to do with existing metadata, like acls, xattrs, fs flags, ... - the current code assumes that there is no existing metadata and just adds the stuff from the archive item.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ThomasWaldmann another solution I can think of is: try:
st = os.stat(path, follow_symlinks=False)
if continue_extraction and same_item(item, st):
return # done! we already have fully extracted this file in a previous run.
if stat.S_ISREG(st.st_mode) and not continue_extraction:
if self.compare_and_extract_chunks(item, path, st=st, pi=pi):
return
elif stat.S_ISDIR(st.st_mode):
os.rmdir(path)
else:
os.unlink(path)This way we can try an in-place update attempt before any removal/recreation. If the function returns True, we're done. Otherwise, we fall back to the original remove/recreate behavior. Also, since I'm using
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I already said: the restore_attrs code expects a fresh state of the file (like newly created, no acls, no xattrs) and just adds the stuff from the archived item. But if you are updating an existing file, there can be already acls or xattrs that do not match what's in the archive (and what shall be the final state). |
||
| except UnicodeEncodeError: | ||
| raise self.IncompatibleFilesystemEncodingError(path, sys.getfilesystemencoding()) from None | ||
| except OSError: | ||
| pass | ||
| st = None | ||
|
|
||
| def make_parent(path): | ||
| parent_dir = os.path.dirname(path) | ||
|
|
@@ -821,6 +880,9 @@ def make_parent(path): | |
| with self.extract_helper(item, path, hlm) as hardlink_set: | ||
| if hardlink_set: | ||
| return | ||
| if self.compare_and_extract_chunks(item, path, st=st, pi=pi): | ||
| return | ||
|
|
||
| with backup_io("open"): | ||
| fd = open(path, "wb") | ||
| with fd: | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.