Track download progress of S3 file using boto3 and callbacks
Install progressbar
with pip3 install progressbar
import boto3, os
import progressbar
bucket_name = "<your-s3-bucket-name>"
folder_name = "<your-directory-name-locally>"
file_name = "<your-filename-locally>"
path = folder_name + "/" + file_name
s3 = boto3.client('s3', aws_access_key_id="<your_aws_access_key_id>", aws_secret_access_key="<your_aws_secret_access_key>")
statinfo = os.stat(file_name)
up_progress = progressbar.progressbar.ProgressBar(maxval=statinfo.st_size)
up_progress.start()
def upload_progress(chunk):
up_progress.update(up_progress.currval + chunk)
s3.upload_file(file_name, bucket_name, path, Callback=upload_progress)
up_progress.finish()
Following the official document, it is not quite difficult to apply progress tracking (download_file and upload_file functions are similar). Here is the full code with some modifications to see the data size in preferred manner.
import logging
import boto3
from botocore.exceptions import ClientError
import os
import sys
import threading
import math
ACCESS_KEY = 'xxx'
SECRET_KEY = 'xxx'
REGION_NAME= 'ap-southeast-1'
class ProgressPercentage(object):
def __init__(self, filename, filesize):
self._filename = filename
self._size = filesize
self._seen_so_far = 0
self._lock = threading.Lock()
def __call__(self, bytes_amount):
def convertSize(size):
if (size == 0):
return '0B'
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size,1024)))
p = math.pow(1024,i)
s = round(size/p,2)
return '%.2f %s' % (s,size_name[i])
# To simplify, assume this is hooked up to a single filename
with self._lock:
self._seen_so_far += bytes_amount
percentage = (self._seen_so_far / self._size) * 100
sys.stdout.write(
"\r%s %s / %s (%.2f%%) " % (
self._filename, convertSize(self._seen_so_far), convertSize(self._size),
percentage))
sys.stdout.flush()
def download_file(file_name, object_name, bucket_name):
# If S3 object_name was not specified, use file_name
if object_name is None:
object_name = file_name
# Initialize s3 client
s3_client = boto3.client(service_name="s3",
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY,
region_name=REGION_NAME)
try:
response = s3_client.download_file(
Bucket=bucket_name,
Key=object_name,
Filename=file_name,
Callback=ProgressPercentage(file_name, (s3_client.head_object(Bucket=bucket_name, Key=object_name))["ContentLength"])
)
except ClientError as e:
logging.error(e)
return False
return True
file_name = "./output.csv.gz"
bucket_name = "mybucket"
object_name = "result/output.csv.gz"
download_file(file_name, object_name, bucket_name )
callback = ProgressPercentage(LOCAL_PATH_TEMP + FILE_NAME))
creates a ProgressPercentage
object, runs its __init__
method, and passes the object as callback
to the download_file
method. This means the __init__
method is run before download_file
begins.
In the __init__
method you are attempting to read the size of the local file being downloaded to, which throws an exception as the file does not exist since the download has yet to start. If you've already downloaded the file, then there's no problem since a local copy exists and its size can be read.
Of course, this is merely the cause of the exception you're seeing. You're using the _size
property as the maximum value of download progress. However you're attempting to use the size of the local file. Until the file is completely downloaded, the local file system does not know how large the file is, it only knows how much space it takes up right now. This means as you download the file will gradually get bigger until it reaches its full size. As such, it doesn't really make sense to consider the size of the local file as the maximum size of the download. It may work in the case where you've already downloaded the file, but that isn't very useful.
The solution to your problem would be to check the size of the file you're going to download, instead of the size of the local copy. This ensures you're getting the actual size of whatever it is you're downloading, and that the file exists (as you couldn't be downloading it if it didn't). You can do this by getting the size of the remote file with head_object
as follows
class ProgressPercentage(object):
def __init__(self, client, bucket, filename):
# ... everything else the same
self._size = client.head_object(Bucket=bucket, Key=filename).ContentLength
# ...
# If you still have the client object you could pass that directly
# instead of transfer._manager._client
progress = ProgressPercentage(transfer._manager._client, BUCKET_NAME, FILE_NAME)
transfer.download_file(..., callback=progress)
As a final note, although you got the code from the Boto3 documentation, it didn't work because it was intended for file uploads. In that case the local file is the source and its existence guaranteed.
This is my implementation. No other dependencies, hack up the progress callback function to display whatever you want.
import sys
import boto3
s3_client = boto3.client('s3')
def download(local_file_name, s3_bucket, s3_object_key):
meta_data = s3_client.head_object(Bucket=s3_bucket, Key=s3_object_key)
total_length = int(meta_data.get('ContentLength', 0))
downloaded = 0
def progress(chunk):
nonlocal downloaded
downloaded += chunk
done = int(50 * downloaded / total_length)
sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
sys.stdout.flush()
print(f'Downloading {s3_object_key}')
with open(local_file_name, 'wb') as f:
s3_client.download_fileobj(s3_bucket, s3_object_key, f, Callback=progress)
e.g.
local_file_name = 'test.csv'
s3_bucket = 'my-bucket'
s3_object_key = 'industry/test.csv'
download(local_file_name, s3_bucket, s3_object_key)
Demo:
Tested with boto3>=1.14.19
, python>=3.7