s3 urls - get bucket name and path
For those who like me was trying to use urlparse to extract key and bucket in order to create object with boto3. There's one important detail: remove slash from the beginning of the key
from urlparse import urlparse
o = urlparse('s3://bucket_name/folder1/folder2/file1.json')
bucket = o.netloc
key = o.path
boto3.client('s3')
client.put_object(Body='test', Bucket=bucket, Key=key.lstrip('/'))
It took a while to realize that because boto3 doesn't throw any exception.
A solution that works without urllib or re (also handles preceding slash):
def split_s3_path(s3_path):
path_parts=s3_path.replace("s3://","").split("/")
bucket=path_parts.pop(0)
key="/".join(path_parts)
return bucket, key
To run:
bucket, key = split_s3_path("s3://my-bucket/some_folder/another_folder/my_file.txt")
Returns:
bucket: my-bucket
key: some_folder/another_folder/my_file.txt
Since it's just a normal URL, you can use urlparse
to get all the parts of the URL.
>>> from urlparse import urlparse
>>> o = urlparse('s3://bucket_name/folder1/folder2/file1.json', allow_fragments=False)
>>> o
ParseResult(scheme='s3', netloc='bucket_name', path='/folder1/folder2/file1.json', params='', query='', fragment='')
>>> o.netloc
'bucket_name'
>>> o.path
'/folder1/folder2/file1.json'
You may have to remove the beginning slash from the key as the next answer suggests.
o.path.lstrip('/')
With Python 3 urlparse
moved to urllib.parse
so use:
from urllib.parse import urlparse
Here's a class that takes care of all the details.
try:
from urlparse import urlparse
except ImportError:
from urllib.parse import urlparse
class S3Url(object):
"""
>>> s = S3Url("s3://bucket/hello/world")
>>> s.bucket
'bucket'
>>> s.key
'hello/world'
>>> s.url
's3://bucket/hello/world'
>>> s = S3Url("s3://bucket/hello/world?qwe1=3#ddd")
>>> s.bucket
'bucket'
>>> s.key
'hello/world?qwe1=3#ddd'
>>> s.url
's3://bucket/hello/world?qwe1=3#ddd'
>>> s = S3Url("s3://bucket/hello/world#foo?bar=2")
>>> s.key
'hello/world#foo?bar=2'
>>> s.url
's3://bucket/hello/world#foo?bar=2'
"""
def __init__(self, url):
self._parsed = urlparse(url, allow_fragments=False)
@property
def bucket(self):
return self._parsed.netloc
@property
def key(self):
if self._parsed.query:
return self._parsed.path.lstrip('/') + '?' + self._parsed.query
else:
return self._parsed.path.lstrip('/')
@property
def url(self):
return self._parsed.geturl()
Pretty easy to accomplish with a single line of builtin string methods...
s3_filepath = "s3://bucket-name/and/some/key.txt"
bucket, key = s3_filepath.replace("s3://", "").split("/", 1)