Helper functions to download the fastai datasets

class Config[source]

Config()

config = Config()
config_path = config.config_path
config_file,config_bak = config_path/'config.yml',config_path/'config.yml.bak'
#This cell is just to make the config file compatible with current fastai
# TODO: make this a method that auto-runs as needed
if 'data_archive_path' not in config:
    config['data_archive_path'] = config.data_path
    config.save()
# Move any existing config file aside so that Config() below runs without one
if config_file.exists(): shutil.move(config_file, config_bak)
try:
    #Test default config
    config = Config()
    assert config_file.exists()
    test_eq(config.archive, config_path/'archive')

    #Test change in config
    config['archive_path'] = '.'
    config.save()
    config = Config()
    test_eq(config.archive, Path('.'))
finally:
    # Always put the backed-up config back, even when one of the checks above
    # fails; otherwise the user's real config would stay stranded as config.yml.bak
    if config_bak.exists(): shutil.move(config_bak, config_file)
    # Reload so `config` reflects whatever file is on disk after the restore
    config = Config()

class URLs[source]

URLs()

Global constants for dataset and model URLs.

Downloading

download_url[source]

download_url(url, dest, overwrite=False, pbar=None, show_progress=True, chunk_size=1048576, timeout=4, retries=5)

Download url to dest unless dest already exists and overwrite is False

url, fname = 'http://files.fast.ai/data/examples/mnist_tiny.tgz', Path('mnist_tiny.tgz')
try:
    download_url(url, fname)
    assert fname.exists()
    t = os.path.getmtime(fname)
    #Launching the function again doesn't trigger a download since the file is already there.
    download_url(url, fname)
    test_eq(t, os.path.getmtime(fname))
    #But with the overwrite option, we download it again.
    download_url(url, fname, overwrite=True)
    test_ne(t, os.path.getmtime(fname))
finally:
    # Guard the cleanup: if the first download failed there is no file, and an
    # unconditional unlink() would raise FileNotFoundError and hide the real error.
    if fname.exists(): fname.unlink()
# Checks that URLs.path(URLs.MNIST_TINY) equals <cwd>/data/mnist_tiny.tgz once the
# archive has been downloaded into a local `data` folder. The folder is removed afterwards.
try:
    os.makedirs('data', exist_ok=True)
    download_url(f"{URLs.MNIST_TINY}.tgz", 'data/mnist_tiny.tgz')
    resolved_path = URLs.path(URLs.MNIST_TINY)
    expected_path = Path.cwd()/'data'/'mnist_tiny.tgz'
    test_eq(resolved_path, expected_path)
finally:
    shutil.rmtree('data')

download_data[source]

download_data(url, fname=None, c_key='archive', force_download=False)

Download url to fname.

If fname is None, it will default to the archive folder you have in your config file (or data, model if you specify a different c_key) followed by the last part of the url: for instance URLs.MNIST_SAMPLE is http://files.fast.ai/data/examples/mnist_sample.tgz and the default value for fname will be ~/.fastai/archive/mnist_sample.tgz.

If force_download=True, the file is always downloaded. Otherwise, the download is only triggered when the file doesn't exist.

try:
    # Default fname: the returned path is the archive folder plus the last part of the url
    test_eq(download_data(URLs.MNIST_SAMPLE), config.archive/'mnist_sample.tgz')
    # Explicit fname: the returned path is exactly the one that was passed in
    test_eq(download_data(URLs.MNIST_TINY, fname=Path('mnist.tgz')), Path('mnist.tgz'))
finally:
    # Only remove the file if it was actually created; an unconditional unlink()
    # would raise FileNotFoundError and mask a failure from the checks above.
    if Path('mnist.tgz').exists(): Path('mnist.tgz').unlink()

# Bind the expected path before the try so the cleanup in `finally` can always refer to it
tst_model = config.model/'mnist_tiny.tgz'
try:
    # With c_key='model' the returned path is under config.model
    test_eq(download_data(URLs.MNIST_TINY, c_key='model'), tst_model)
finally:
    # Single cleanup point: remove the downloaded archive whether or not the check passed
    if tst_model.exists(): tst_model.unlink()

Extract

file_extract[source]

file_extract(fname, dest=None)

Extract fname to dest using tarfile or zipfile

file_extract is used by default in untar_data to decompress the downloaded file.

untar_data[source]

untar_data(url, fname=None, dest=None, c_key='data', force_download=False, extract_func='file_extract')

Download url to fname if dest doesn't exist, and un-tgz to folder dest.

untar_data is a convenience function for the fastai datasets, intended to work with the urls in URLs. You can use it with another url only if it ends with .tgz (otherwise the function can download it but not decompress it). For other extensions, you should use download_data then the necessary decompress function.

If fname is specified, the data will be downloaded to that destination, otherwise it will default to the archive path in your config file (default ~/.fastai/archive/) followed by the last part of your url. For instance URLs.MNIST_SAMPLE is http://files.fast.ai/data/examples/mnist_sample.tgz and the default value for fname will be ~/.fastai/archive/mnist_sample.tgz.

If dest is specified, the data will be decompressed to that folder. Otherwise, it will default to the data path (or model/archive if you specify a different c_key) in your config file (default ~/.fastai/data/) followed by the last part of your url without extension. For instance URLs.MNIST_SAMPLE is http://files.fast.ai/data/examples/mnist_sample.tgz and the default value for dest will be ~/.fastai/data/mnist_sample.

force_download=True will retrigger a download, otherwise the behavior is to:

  • not do anything when dest exists
  • otherwise decompress fname to dest if fname exists
  • otherwise download then decompress fname to dest

You can pass any function that takes fname and dest arguments as extract_func. By default, file_extract is used, which extracts the file using tarfile or zipfile, based on the extension.

# Default call: the returned folder is config.data/'mnist_sample'
sample_dest = untar_data(URLs.MNIST_SAMPLE)
test_eq(sample_dest, config.data/'mnist_sample')

# Explicit fname: after a forced download the archive exists at that path
p = Path('mnist_tiny.tgz')
untar_data(URLs.MNIST_TINY, fname='mnist_tiny.tgz', force_download=True)
assert p.exists()
p.unlink()

# Explicit dest: the data is extracted under the given folder
tiny_dest = Path('mnist_tiny')
test_eq(untar_data(URLs.MNIST_TINY, dest='.'), tiny_dest)
assert tiny_dest.exists()
shutil.rmtree(tiny_dest)

# c_key='model': the extracted folder is under config.model
tst_model = config.model/'mnist_sample'
model_dest = untar_data(URLs.MNIST_SAMPLE, c_key='model')
test_eq(model_dest, tst_model)
archive_in_models = tst_model.with_suffix('.tgz')
archive_in_archive = config.archive/'mnist_sample.tgz'
assert not archive_in_models.exists() #Archive wasn't downloaded in the models path
assert archive_in_archive.exists() #Archive was downloaded there
shutil.rmtree(tst_model)