Примечание: Попытка загрузки файла из Kaggle заблокирована, поскольку вы еще не вошли в систему.
Вот скрипт для загрузки всех наборов данных соревнования.
from requests import get, post
from os import mkdir, remove
from os.path import exists
from shutil import rmtree
import zipfile
def purge_all_downloads(db_full_path):
# Removes all the downloaded datasets
if exists(db_full_path): rmtree(db_full_path)
def datasets_are_available_locally(db_full_path, datasets):
# Returns True only if all the competition datasets are available locally in Databricks CE
if not exists(db_full_path): return False
for df in datasets:
# Assumes all the datasets end with '.csv' extention
if not exists(db_full_path + df + '.csv'): return False
return True
def remove_zip_files(db_full_path, datasets):
for df in datasets:
remove(db_full_path + df + '.csv.zip')
def unzip(db_full_path, datasets):
for df in datasets:
with zipfile.ZipFile(db_full_path + df + '.csv.zip', 'r') as zf:
zf.extractall(db_full_path)
remove_zip_files(db_full_path, datasets)
def download_datasets(competition, db_full_path, datasets, username, password):
# Downloads the competition datasets if not availible locally
if datasets_are_available_locally(db_full_path, datasets):
print 'All the competition datasets have been downloaded, extraced and are ready for you !'
return
purge_all_downloads(db_full_path)
mkdir(db_full_path)
kaggle_info = {'UserName': username, 'Password': password}
for df in datasets:
url = (
'https://www.kaggle.com/account/login?ReturnUrl=' +
'/c/' + competition + '/download/'+ df + '.csv.zip'
)
request = post(url, data=kaggle_info, stream=True)
# write data to local file
with open(db_full_path + df + '.csv.zip', "w") as f:
for chunk in request.iter_content(chunk_size = 512 * 1024):
if chunk: f.write(chunk)
# extract competition data
unzip(db_full_path, datasets)
print('done !')
Для получения более подробной информации см. " Загрузите наборы данных соревнований напрямую ".
Надеюсь, это поможет.