# Source code for brainstat.tutorial.utils

import warnings
from pathlib import Path
import pandas as pd
from sklearn.utils import Bunch
from nilearn.datasets.utils import _get_dataset_dir, _fetch_files


def fetch_tutorial_data(n_subjects=20, data_dir=None, resume=True, verbose=1):
    """Download and load the surfstat tutorial dataset.

    Parameters
    ----------
    n_subjects: int, optional
        The number of subjects to load. By default, 20 subjects will be
        loaded. If None is given, all subjects listed in the demographics
        file will be loaded. If more subjects are requested than are
        available, a warning is issued and all available subjects are used.
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a
        specified location. If None, data will be downloaded to ~ (home
        directory). Default: None
    resume: bool, optional
        If true, try resuming download if possible. Forwarded to the
        nilearn download helper.
    verbose: int, optional
        Verbosity level forwarded to the nilearn dataset helpers.
        Default: 1

    Returns
    -------
    data: sklearn.utils.Bunch
        Dictionary-like object, the interest attributes are :

        - 'image_files': Paths to image files in mgh format, as returned by
          the nilearn download helper (left and right hemisphere for each
          selected subject)
        - 'demographics': pandas.DataFrame with the demographic
          information, restricted to the selected subjects

    References
    ----------
    :Download: https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV

    """
    # set dataset url
    url = "https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV"

    # set data_dir, if not directly set use ~ as default
    if data_dir is None:
        data_dir = str(Path.home())

    # set dataset name and get its corresponding directory
    dataset_name = "brainstat_tutorial"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose)

    # set download information for demographic file
    files = [
        (
            "brainstat_tutorial_df.csv",
            url + "/download?path=%2FSurfStat_tutorial_data&files=myStudy.csv",
            {"move": "brainstat_tutorial_df.csv"},
        )
    ]

    # download demographic file
    path_to_demographics = _fetch_files(
        data_dir, files, resume=resume, verbose=verbose
    )[0]

    # read the demographic file once; it provides both the subject ids and
    # the table that is returned to the caller
    demographics = pd.read_csv(path_to_demographics)
    ids = demographics["ID2"].tolist()

    # set and check subjects, in total and subset
    max_subjects = len(ids)
    if n_subjects is None:
        n_subjects = max_subjects
    if n_subjects > max_subjects:
        warnings.warn("Warning: there are only %d subjects" % max_subjects)
        n_subjects = max_subjects
    ids = ids[:n_subjects]

    # restrict demographic information to subset of subjects
    demographics = demographics[demographics["ID2"].isin(ids)]

    # set download information for image files; every entry points at the
    # same zip archive, which is uncompressed into the dataset directory
    image_specs = [
        (
            "thickness/{}_{}2fsaverage5_20.mgh".format(subj, hemi),
            url + "/download?path=%2F&files=brainstat_tutorial.zip",
            {"uncompress": True, "move": "brainstat_tutorial.zip"},
        )
        for subj in ids
        for hemi in ["lh", "rh"]
    ]

    # download image files, honouring the caller's resume/verbose settings
    image_files = _fetch_files(
        data_dir, image_specs, resume=resume, verbose=verbose
    )

    # pack everything in a scikit-learn bunch and return it
    return Bunch(demographics=demographics, image_files=image_files)