Source code for duplicity.progress

# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4; encoding:utf-8 -*-
#
# Copyright 2002 Ben Escoto <ben@emerose.org>
# Copyright 2007 Kenneth Loafman <kenneth@loafman.com>
#
# This file is part of duplicity.
#
# Duplicity is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# Duplicity is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with duplicity; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# @author: Juan Antonio Moya Vicen <juan@nowcomputing.com>
#
u"""
Functions to compute progress of compress & upload files
The heuristics try to infer the ratio between the amount of data collected
by the deltas and the total size of the changing files. It also infers the
compression and encryption ration of the raw deltas before sending them to
the backend.
With the inferred ratios, the heuristics estimate the percentage of completion
and the time left to transfer all the (yet unknown) amount of data to send.
This is a forecast based on gathered evidence.
"""

from __future__ import absolute_import
from __future__ import division
from builtins import object

from datetime import datetime, timedelta
import collections as sys_collections
import math
import pickle
import threading
import time

from duplicity import config
from duplicity import log
from duplicity import util

tracker = None
progress_thread = None


[docs]class Snapshot(sys_collections.deque): u""" A convenience class for storing snapshots in a space/timing efficient manner Stores up to 10 consecutive progress snapshots, one for each volume """
[docs] @staticmethod def unmarshall(): u""" De-serializes cached data it if present """ snapshot = Snapshot() # If restarting Full, discard marshalled data and start over if config.restart is not None and config.restart.start_vol >= 1: try: progressfd = open(u'%s/progress' % config.archive_dir_path.name, u'r') snapshot = pickle.load(progressfd) progressfd.close() except Exception as e: log.Warn(u"Warning, cannot read stored progress info from previous backup: {}".format(util.uexc(e)), log.WarningCode.cannot_stat) snapshot = Snapshot() # Reached here no cached data found or wrong marshalling return snapshot
[docs] def marshall(self): u""" Serializes object to cache """ progressfd = open(b'%s/progress' % config.archive_dir_path.name, u'wb+') pickle.dump(self, progressfd) progressfd.close()
[docs] def __init__(self, iterable=[], maxlen=10): super(Snapshot, self).__init__(iterable, maxlen) self.last_vol = 0
[docs] def get_snapshot(self, volume): nitems = len(self) if nitems <= 0: return 0.0 return self[max(0, min(nitems + volume - self.last_vol - 1, nitems - 1))]
[docs] def push_snapshot(self, volume, snapshot_data): self.append(snapshot_data) self.last_vol = volume
[docs] def pop_snapshot(self): return self.popleft()
[docs] def clear(self): super(Snapshot, self).clear() self.last_vol = 0
[docs]class ProgressTracker(object):
[docs] def __init__(self): self.total_stats = None self.nsteps = 0 self.start_time = None self.change_mean_ratio = 0.0 self.change_r_estimation = 0.0 self.progress_estimation = 0.0 self.time_estimation = 0 self.total_bytecount = 0 self.last_total_bytecount = 0 self.last_bytecount = 0 self.stall_last_time = None self.last_time = None self.elapsed_sum = timedelta() self.speed = 0.0 self.transfers = sys_collections.deque() self.is_full = False self.current_estimation = 0.0 self.prev_estimation = 0.0 self.prev_data = None
[docs] def snapshot_progress(self, volume): u""" Snapshots the current progress status for each volume into the disk cache If backup is interrupted, next restart will deserialize the data and try start progress from the snapshot """ if self.prev_data is not None: self.prev_data.push_snapshot(volume, self.progress_estimation) self.prev_data.marshall()
[docs] def has_collected_evidence(self): u""" Returns true if the progress computation is on and duplicity has not yet started the first dry-run pass to collect some information """ return (self.total_stats is not None)
[docs] def log_upload_progress(self): u""" Aproximative and evolving method of computing the progress of upload """ if not config.progress or not self.has_collected_evidence(): return current_time = datetime.now() if self.start_time is None: self.start_time = current_time if self.last_time is not None: elapsed = (current_time - self.last_time) else: elapsed = timedelta() self.last_time = current_time # Detect (and report) a stallment if no changing data for more than 5 seconds if self.stall_last_time is None: self.stall_last_time = current_time if (current_time - self.stall_last_time).seconds > max(5, 2 * config.progress_rate): log.TransferProgress(100.0 * self.progress_estimation, self.time_estimation, self.total_bytecount, (current_time - self.start_time).seconds, self.speed, True ) return self.nsteps += 1 u""" Compute the ratio of information being written for deltas vs file sizes Using Knuth algorithm to estimate approximate upper bound in % of completion The progress is estimated on the current bytes written vs the total bytes to change as estimated by a first-dry-run. The weight is the ratio of changing data (Delta) against the total file sizes. (pessimistic estimation) The method computes the upper bound for the progress, when using a sufficient large volsize to accomodate and changes, as using a small volsize may inject statistical noise. """ from duplicity import diffdir changes = diffdir.stats.NewFileSize + diffdir.stats.ChangedFileSize total_changes = self.total_stats.NewFileSize + self.total_stats.ChangedFileSize if total_changes == 0 or diffdir.stats.RawDeltaSize == 0: return # Snapshot current values for progress last_progress_estimation = self.progress_estimation if self.is_full: # Compute mean ratio of data transfer, assuming 1:1 data density self.current_estimation = float(self.total_bytecount) / float(total_changes) else: # Compute mean ratio of data transfer, estimating unknown progress change_ratio = float(self.total_bytecount) / float(diffdir.stats.RawDeltaSize) change_delta = change_ratio - self.change_mean_ratio self.change_mean_ratio += change_delta / float(self.nsteps) # mean cumulated ratio self.change_r_estimation += change_delta * (change_ratio - self.change_mean_ratio) change_sigma = math.sqrt(math.fabs(self.change_r_estimation / float(self.nsteps))) u""" Combine variables for progress estimation Fit a smoothed curve that covers the most common data density distributions, aiming for a large number of incremental changes. The computation is: Use 50% confidence interval lower bound during first half of the progression. Conversely, use 50% C.I. upper bound during the second half. Scale it to the changes/total ratio """ self.current_estimation = float(changes) / float(total_changes) * ( (self.change_mean_ratio - 0.67 * change_sigma) * (1.0 - self.current_estimation) + (self.change_mean_ratio + 0.67 * change_sigma) * self.current_estimation ) u""" In case that we overpassed the 100%, drop the confidence and trust more the mean as the sigma may be large. """ if self.current_estimation > 1.0: self.current_estimation = float(changes) / float(total_changes) * ( (self.change_mean_ratio - 0.33 * change_sigma) * (1.0 - self.current_estimation) + (self.change_mean_ratio + 0.33 * change_sigma) * self.current_estimation ) u""" Meh!, if again overpassed the 100%, drop the confidence to 0 and trust only the mean. """ if self.current_estimation > 1.0: self.current_estimation = self.change_mean_ratio * float(changes) / float(total_changes) u""" Lastly, just cap it... nothing else we can do to approximate it better. Cap it to 99%, as the remaining 1% to 100% we reserve for the last step uploading of signature and manifests """ self.progress_estimation = max(0.0, min(self.prev_estimation + (1.0 - self.prev_estimation) * self.current_estimation, 0.99)) u""" Estimate the time just as a projection of the remaining time, fit to a [(1 - x) / x] curve """ # As sum of timedeltas, so as to avoid clock skew in long runs # (adding also microseconds) self.elapsed_sum += elapsed projection = 1.0 if self.progress_estimation > 0: projection = (1.0 - self.progress_estimation) / self.progress_estimation self.time_estimation = int(projection * float(self.elapsed_sum.total_seconds())) # Apply values only when monotonic, so the estimates look more consistent to the human eye if self.progress_estimation < last_progress_estimation: self.progress_estimation = last_progress_estimation u""" Compute Exponential Moving Average of speed as bytes/sec of the last 30 probes """ if elapsed.total_seconds() > 0: self.transfers.append(float(self.total_bytecount - self.last_total_bytecount) / float(elapsed.total_seconds())) self.last_total_bytecount = self.total_bytecount if len(self.transfers) > 30: self.transfers.popleft() self.speed = 0.0 for x in self.transfers: self.speed = 0.3 * x + 0.7 * self.speed log.TransferProgress(100.0 * self.progress_estimation, self.time_estimation, self.total_bytecount, (current_time - self.start_time).seconds, self.speed, False )
[docs] def annotate_written_bytes(self, bytecount): u""" Annotate the number of bytes that have been added/changed since last time this function was called. bytecount param will show the number of bytes since the start of the current volume and for the current volume """ changing = max(bytecount - self.last_bytecount, 0) self.total_bytecount += int(changing) # Annotate only changing bytes since last probe self.last_bytecount = bytecount if changing > 0: self.stall_last_time = datetime.now()
[docs] def set_evidence(self, stats, is_full): u""" Stores the collected statistics from a first-pass dry-run, to use this information later so as to estimate progress """ self.total_stats = stats self.is_full = is_full
[docs] def set_start_volume(self, volume): self.prev_data = Snapshot.unmarshall() self.prev_estimation = self.prev_data.get_snapshot(volume) self.progress_estimation = max(0.0, min(self.prev_estimation, 0.99))
[docs] def total_elapsed_seconds(self): u""" Elapsed seconds since the first call to log_upload_progress method """ return (datetime.now() - self.start_time).seconds
[docs]def report_transfer(bytecount, totalbytes): # pylint: disable=unused-argument u""" Method to call tracker.annotate_written_bytes from outside the class, and to offer the "function(long, long)" signature which is handy to pass as callback """ global tracker global progress_thread if progress_thread is not None and tracker is not None: tracker.annotate_written_bytes(bytecount)
[docs]class LogProgressThread(threading.Thread): u""" Background thread that reports progress to the log, every --progress-rate seconds """
[docs] def __init__(self): super(LogProgressThread, self).__init__() self.setDaemon(True) self.finished = False
[docs] def run(self): global tracker if not config.dry_run and config.progress and tracker.has_collected_evidence(): while not self.finished: tracker.log_upload_progress() time.sleep(config.progress_rate)