# Source code for brainstat.tutorial.utils

import warnings
from pathlib import Path
import pandas as pd
from sklearn.utils import Bunch
from nilearn.datasets.utils import _get_dataset_dir, _fetch_files


def fetch_tutorial_data(n_subjects=20, data_dir=None, resume=True, verbose=1):
    """Download and load the surfstat tutorial dataset.

    Parameters
    ----------
    n_subjects: int, optional
        The number of subjects to load from maximum of 100 subjects.
        By default, 20 subjects will be loaded. If None is given,
        all subjects listed in the demographics file will be loaded.
        If more subjects are requested than are available, a warning is
        issued and all available subjects are loaded.
    data_dir: string, optional
        Path of the data directory. Used to force data storage in a specified
        location. If None, data will be downloaded to ~ (home directory).
        Default: None
    resume: bool, optional
        If true, try resuming download if possible. Forwarded to the
        download helper for both the demographics and the image files.
        Default: True
    verbose: int, optional
        Verbosity level forwarded to the dataset-directory and download
        helpers. Default: 1

    Returns
    -------
    data: sklearn.utils.Bunch
        Dictionary-like object, the interest attributes are :
         - 'image_files': Paths to image files in mgh format, two per
           subject (left hemisphere first, then right hemisphere)
         - 'demographics': pandas.DataFrame holding the demographic
           information of the selected subjects

    Raises
    ------
    ValueError
        If ``n_subjects`` is negative.

    References
    ----------

    :Download: https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV

    """

    # reject negative counts up front: a negative value would otherwise be
    # used as a slice bound and silently drop subjects from the end
    if n_subjects is not None and n_subjects < 0:
        raise ValueError(
            "n_subjects must be a non-negative integer or None, got %r"
            % (n_subjects,)
        )

    # set dataset url
    url = "https://box.bic.mni.mcgill.ca/s/wMPF2vj7EoYWELV"

    # set data_dir, if not directly set use ~ as default
    if data_dir is None:
        data_dir = str(Path.home())

    # set dataset name and get its corresponding directory
    dataset_name = "brainstat_tutorial"
    data_dir = _get_dataset_dir(dataset_name, data_dir=data_dir, verbose=verbose)

    # set download information for demographic file
    files = [
        (
            "brainstat_tutorial_df.csv",
            url + "/download?path=%2FSurfStat_tutorial_data&files=myStudy.csv",
            {"move": "brainstat_tutorial_df.csv"},
        )
    ]

    # download demographic file
    path_to_demographics = _fetch_files(
        data_dir, files, resume=resume, verbose=verbose
    )[0]

    # read the demographic file once; it provides both the complete list of
    # subject ids and the table that is returned to the caller
    demographics = pd.read_csv(path_to_demographics)
    ids = demographics["ID2"].tolist()

    # set and check subjects, in total and subset
    max_subjects = len(ids)
    if n_subjects is None:
        n_subjects = max_subjects
    elif n_subjects > max_subjects:
        warnings.warn("Warning: there are only %d subjects" % max_subjects)
        n_subjects = max_subjects
    ids = ids[:n_subjects]

    # restrict demographic information to subset of subjects
    demographics = demographics[demographics["ID2"].isin(ids)]

    # set download information for image files and download them; every
    # entry points at the same zip archive, which holds all thickness maps
    image_files = _fetch_files(
        data_dir,
        [
            (
                "thickness/{}_{}2fsaverage5_20.mgh".format(subj, hemi),
                url + "/download?path=%2F&files=brainstat_tutorial.zip",
                {"uncompress": True, "move": "brainstat_tutorial.zip"},
            )
            for subj in ids
            for hemi in ["lh", "rh"]
        ],
        resume=resume,
        verbose=verbose,
    )

    # pack everything in a scikit-learn bunch and return it
    return Bunch(demographics=demographics, image_files=image_files)