diff --git a/GANDLF/cli/huggingface_hub_handler.py b/GANDLF/cli/huggingface_hub_handler.py new file mode 100644 index 000000000..72e2f35b0 --- /dev/null +++ b/GANDLF/cli/huggingface_hub_handler.py @@ -0,0 +1,142 @@ +from huggingface_hub import HfApi, snapshot_download, ModelCardData, ModelCard +from typing import List, Union +from GANDLF import version +from pathlib import Path +from GANDLF.utils import get_git_hash +import re + + +def validate_model_card(file_path: str): + """ + Validate that the required fields in the model card are not null, empty, or set to 'REQUIRED_FOR_GANDLF'. + The fields must contain valid alphabetic or alphanumeric values. + + Args: + file_path (str): The path to the Markdown file to validate. + + Raises: + AssertionError: If any required field is missing, empty, null, or contains 'REQUIRED_FOR_GANDLF'. + """ + # Read the Markdown file + path = Path(file_path) + with path.open("r") as file: + template_str = file.read() + + # Define required fields and their regex patterns to capture the values + patterns = { + "Developed by": re.compile( + r'\*\*Developed by:\*\*\s*\{\{\s*developers\s*\|\s*default\("(.+?)",\s*true\)\s*\}\}', + re.MULTILINE, + ), + "License": re.compile( + r'\*\*License:\*\*\s*\{\{\s*license\s*\|\s*default\("(.+?)",\s*true\)\s*\}\}', + re.MULTILINE, + ), + "Primary Organization": re.compile( + r'\*\*Primary Organization:\*\*\s*\{\{\s*primary_organization\s*\|\s*default\("(.+?)",\s*true\)\s*\}\}', + re.MULTILINE, + ), + "Commercial use policy": re.compile( + r'\*\*Commercial use policy:\*\*\s*\{\{\s*commercial_use\s*\|\s*default\("(.+?)",\s*true\)\s*\}\}', + re.MULTILINE, + ), + } + + # Iterate through the required fields and validate + for field, pattern in patterns.items(): + match = pattern.search(template_str) + + # Ensure the field is present and does not contain 'REQUIRED_FOR_GANDLF' + assert match, f"Field '{field}' is missing or not found in the file." + + extract_value = match.group(1) + + # Get the field value + value = ( + re.search(r"\[([^\]]+)\]", extract_value).group(1) + if re.search(r"\[([^\]]+)\]", extract_value) + else None + ) + + # Ensure the field is not set to 'REQUIRED_FOR_GANDLF' or empty + assert ( + value != "REQUIRED_FOR_GANDLF" + ), f"The value for '{field}' is set to the default placeholder '[REQUIRED_FOR_GANDLF]'. It must be a valid value." + assert value, f"The value for '{field}' is empty or null." + + # Ensure the value contains only alphabetic or alphanumeric characters + assert re.match( + r"^[a-zA-Z0-9]+$", value + ), f"The value for '{field}' must be alphabetic or alphanumeric, but got: '{value}'" + + print( + "All required fields are valid, non-empty, properly filled, and do not contain '[REQUIRED_FOR_GANDLF]'." 
+    )
+
+    # Return the validated template so the uploader can render it
+    return template_str
+
+
+def push_to_model_hub(
+    repo_id: str,
+    folder_path: str,
+    hf_template: str,
+    path_in_repo: Union[str, None] = None,
+    commit_message: Union[str, None] = None,
+    commit_description: Union[str, None] = None,
+    token: Union[str, None] = None,
+    repo_type: Union[str, None] = None,
+    revision: Union[str, None] = None,
+    allow_patterns: Union[List[str], str, None] = None,
+    ignore_patterns: Union[List[str], str, None] = None,
+    delete_patterns: Union[List[str], str, None] = None,
+):
+    """Validate the model card, render it as README.md, and upload the model folder to the Hugging Face Hub."""
+    api = HfApi(token=token)
+
+    try:
+        repo_id = api.create_repo(repo_id).repo_id
+    except Exception as e:
+        print(f"Error: {e}")
+
+    tags = ["v" + version]
+
+    git_hash = get_git_hash()
+
+    if git_hash != "None":
+        tags += [git_hash]
+
+    readme_template = validate_model_card(hf_template)
+
+    card_data = ModelCardData(library_name="GaNDLF", tags=tags)
+    card = ModelCard.from_template(card_data, template_str=readme_template)
+
+    card.save(Path(folder_path, "README.md"))
+
+    api.upload_folder(
+        repo_id=repo_id,
+        folder_path=folder_path,
+        path_in_repo=path_in_repo,
+        commit_message=commit_message,
+        commit_description=commit_description,
+        repo_type=repo_type if repo_type else "model",
+        revision=revision,
+        allow_patterns=allow_patterns,
+        ignore_patterns=ignore_patterns,
+        delete_patterns=delete_patterns,
+    )
+    print("Model successfully uploaded")
+
+
+def download_from_hub(
+    repo_id: str,
+    revision: Union[str, None] = None,
+    cache_dir: Union[str, None] = None,
+    local_dir: Union[str, None] = None,
+    force_download: bool = False,
+    token: Union[str, None] = None,
+):
+    """Download a model snapshot from the Hugging Face Hub."""
+    snapshot_download(
+        repo_id=repo_id,
+        revision=revision,
+        cache_dir=cache_dir,
+        local_dir=local_dir,
+        force_download=force_download,
+        token=token,
+    )
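For orientation, a minimal usage sketch of the handler above — assuming a valid Hub token and a trained model folder; the repo id and paths are illustrative placeholders, not part of the PR:

```python
from GANDLF.cli.huggingface_hub_handler import (
    download_from_hub,
    push_to_model_hub,
)

# Validate the card template, render README.md, and upload the model folder.
push_to_model_hub(
    repo_id="my-org/my-gandlf-model",        # illustrative repo id
    folder_path="./model_dir",               # trained GaNDLF model directory
    hf_template="./GANDLF/hugging_face.md",  # model card template
    token="hf_****",                         # a valid user token is assumed
)

# Fetch the same snapshot back into a local directory.
download_from_hub(repo_id="my-org/my-gandlf-model", local_dir="./model_dir_copy")
```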
diff --git a/GANDLF/entrypoints/hf_hub_integration.py b/GANDLF/entrypoints/hf_hub_integration.py
new file mode 100644
index 000000000..353f31dfb
--- /dev/null
+++ b/GANDLF/entrypoints/hf_hub_integration.py
@@ -0,0 +1,156 @@
+import click
+from GANDLF.entrypoints import append_copyright_to_help
+from GANDLF.cli.huggingface_hub_handler import push_to_model_hub, download_from_hub
+from pathlib import Path
+
+huggingfaceDir_ = Path(__file__).parent.absolute()
+
+huggingfaceDir = huggingfaceDir_.parent
+
+# Default path of the Hugging Face model card template used for deployment
+huggingface_file_path = huggingfaceDir / "hugging_face.md"
+
+
+@click.command()
+@click.option(
+    "--upload/--download",
+    "-u/-d",
+    required=True,
+    help="Upload or download to/from a Huggingface Repo",
+)
+@click.option(
+    "--repo-id",
+    "-rid",
+    required=True,
+    help="Downloading/Uploading: A user or an organization name and a repo name separated by a /",
+)
+@click.option(
+    "--token",
+    "-tk",
+    help="Downloading/Uploading: A token to be used for the download/upload",
+)
+@click.option(
+    "--revision",
+    "-rv",
+    help="Downloading/Uploading: git revision id which can be a branch name, a tag, or a commit hash",
+)
+@click.option(
+    "--cache-dir",
+    "-cdir",
+    help="Downloading: path to the folder where cached files are stored",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+)
+@click.option(
+    "--local-dir",
+    "-ldir",
+    help="Downloading: if provided, the downloaded files will be placed under this directory",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+)
+@click.option(
+    "--force-download",
+    "-fd",
+    is_flag=True,
+    help="Downloading: Whether the files should be downloaded even if they already exist in the local cache",
+)
+@click.option(
+    "--folder-path",
+    "-fp",
+    help="Uploading: Path to the folder to upload on the local file system",
+    type=click.Path(exists=True, file_okay=False, dir_okay=True),
+)
+@click.option(
+    "--path-in-repo",
+    "-pir",
+    help="Uploading: Relative path of the directory in the repo. Will default to the root folder of the repository",
+)
+@click.option(
+    "--commit-message",
+    "-cr",
+    help='Uploading: The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub"',
+)
+@click.option(
+    "--commit-description",
+    "-cd",
+    help="Uploading: The description of the generated commit",
+)
+@click.option(
+    "--repo-type",
+    "-rt",
+    help='Uploading: Set to "dataset" or "space" if uploading to a dataset or space, "model" if uploading to a model. Default is model',
+)
+@click.option(
+    "--allow-patterns",
+    "-ap",
+    help="Uploading: If provided, only files matching at least one pattern are uploaded.",
+)
+@click.option(
+    "--ignore-patterns",
+    "-ip",
+    help="Uploading: If provided, files matching any of the patterns are not uploaded.",
+)
+@click.option(
+    "--delete-patterns",
+    "-dp",
+    help="Uploading: If provided, remote files matching any of the patterns will be deleted from the repo while committing new files. This is useful if you don't know which files have already been uploaded.",
+)
+@click.option(
+    "--hf-template",
+    "-hft",
+    help="Uploading: Path to the model card template. Required when uploading a model",
+    default=huggingface_file_path,
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+)
+@append_copyright_to_help
+def new_way(
+    upload: bool,
+    repo_id: str,
+    token: str,
+    hf_template: str,
+    revision: str,
+    cache_dir: str,
+    local_dir: str,
+    force_download: bool,
+    folder_path: str,
+    path_in_repo: str,
+    commit_message: str,
+    commit_description: str,
+    repo_type: str,
+    allow_patterns: str,
+    ignore_patterns: str,
+    delete_patterns: str,
+):
+    """Manages model transfers to and from the Hugging Face Hub"""
+    if upload:
+        push_to_model_hub(
+            repo_id,
+            folder_path,
+            hf_template,
+            path_in_repo,
+            commit_message,
+            commit_description,
+            token,
+            repo_type,
+            revision,
+            allow_patterns,
+            ignore_patterns,
+            delete_patterns,
+        )
+    else:
+        download_from_hub(
+            repo_id, revision, cache_dir, local_dir, force_download, token
+        )
diff --git a/GANDLF/entrypoints/subcommands.py b/GANDLF/entrypoints/subcommands.py
index 814b00b66..354229869 100644
--- a/GANDLF/entrypoints/subcommands.py
+++ b/GANDLF/entrypoints/subcommands.py
@@ -12,6 +12,7 @@
 from GANDLF.entrypoints.generate_metrics import new_way as generate_metrics_command
 from GANDLF.entrypoints.debug_info import new_way as debug_info_command
 from GANDLF.entrypoints.split_csv import new_way as split_csv_command
+from GANDLF.entrypoints.hf_hub_integration import new_way as hf_command
 
 
 cli_subcommands = {
@@ -29,4 +30,5 @@
     "generate-metrics": generate_metrics_command,
     "debug-info": debug_info_command,
     "split-csv": split_csv_command,
+    "hf": hf_command,
 }
diff --git a/GANDLF/hugging_face.md b/GANDLF/hugging_face.md
new
file mode 100644 index 000000000..782a42746 --- /dev/null +++ b/GANDLF/hugging_face.md @@ -0,0 +1,203 @@ +--- +# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1 +# Doc / guide: https://huggingface.co/docs/hub/model-cards +# Any fields required by GaNDLF are marked with REQUIRED_FOR_GANDLF, and will be checked +{{ card_data }} +--- + +# Model Card for {{ model_id | default("Model ID", true) }} + + + +{{ model_summary | default("", true) }} + +## Model Details + +### Model Description + + + +{{ model_description | default("", true) }} + +- **Developed by:** {{ developers | default("[GANDLF]", true)}} +- **License:** {{ license | default("[GANDLF]", true)}} +- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}} +- **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} +- **Model type:** {{ model_type | default("[More Information Needed]", true)}} +- **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}} +- **Finetuned from model [optional]:** {{ base_model | default("[More Information Needed]", true)}} +- **Primary Organization:** {{ primary_organization | default("[GANDLF]", true)}} +- **Commercial use policy:** {{ commercial_use | default("[GANDLF]", true)}} + +### Model Sources [optional] + + + +- **Repository:** {{ repo | default("[https://github.com/mlcommons/GaNDLF]", true)}} +- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}} +- **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}} + +## Uses + + + +### Direct Use + + + +{{ direct_use | default("[More Information Needed]", true)}} + +### Downstream Use [optional] + + + +{{ downstream_use | default("[More Information Needed]", true)}} + +### Out-of-Scope Use + + + +{{ out_of_scope_use | default("[More Information Needed]", true)}} + +## Bias, Risks, and Limitations + + + +{{ bias_risks_limitations | default("[More Information Needed]", true)}} + +### Recommendations + + + +{{ bias_recommendations | default("Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.", true)}} + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +{{ get_started_code | default("[More Information Needed]", true)}} + +## Training Details + +### Training Data + + + +{{ training_data | default("[More Information Needed]", true)}} + +### Training Procedure + + + +#### Preprocessing [optional] + +{{ preprocessing | default("[More Information Needed]", true)}} + + +#### Training Hyperparameters + +- **Training regime:** {{ training_regime | default("[More Information Needed]", true)}} + +#### Speeds, Sizes, Times [optional] + + + +{{ speeds_sizes_times | default("[More Information Needed]", true)}} + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +{{ testing_data | default("[More Information Needed]", true)}} + +#### Factors + + + +{{ testing_factors | default("[More Information Needed]", true)}} + +#### Metrics + + + +{{ testing_metrics | default("[More Information Needed]", true)}} + +### Results + +{{ results | default("[More Information Needed]", true)}} + +#### Summary + +{{ results_summary | default("", true) }} + +## Model Examination [optional] + + + +{{ model_examination | default("[More Information Needed]", true)}} + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** {{ hardware_type | default("[More Information Needed]", true)}} +- **Hours used:** {{ hours_used | default("[More Information Needed]", true)}} +- **Cloud Provider:** {{ cloud_provider | default("[More Information Needed]", true)}} +- **Compute Region:** {{ cloud_region | default("[More Information Needed]", true)}} +- **Carbon Emitted:** {{ co2_emitted | default("[More Information Needed]", true)}} + +## Technical Specifications [optional] + +### Model Architecture and Objective + +{{ model_specs | default("[More Information Needed]", true)}} + +### Compute Infrastructure + +{{ compute_infrastructure | default("[More Information Needed]", true)}} + +#### Hardware + +{{ hardware_requirements | default("[More Information Needed]", true)}} + +#### Software + +{{ software | default("[More Information Needed]", true)}} + +## Citation [optional] + + + +**BibTeX:** + +{{ citation_bibtex | default("[More Information Needed]", true)}} + +**APA:** + +{{ citation_apa | default("[More Information Needed]", true)}} + +## Glossary [optional] + + + +{{ glossary | default("[More Information Needed]", true)}} + +## More Information [optional] + +{{ more_information | default("[More Information Needed]", true)}} + +## Model Card Authors [optional] + +{{ model_card_authors | default("[More Information Needed]", true)}} + +## Model Card Contact + +{{ model_card_contact | default("[More Information Needed]", true)}} diff --git a/GANDLF/training_manager.py b/GANDLF/training_manager.py index e605af6f6..1fe74f0d3 100644 --- a/GANDLF/training_manager.py +++ b/GANDLF/training_manager.py @@ -5,6 +5,8 @@ from GANDLF.compute import training_loop from GANDLF.utils import get_dataframe, split_data +import yaml + def TrainingManager( dataframe: pd.DataFrame, @@ -158,6 +160,8 @@ def TrainingManager_split( reset (bool): Whether the previous run will be reset or not. 
""" currentModelConfigPickle = os.path.join(outputDir, "parameters.pkl") + currentModelConfigYaml = os.path.join(outputDir, "config.yaml") + if (not os.path.exists(currentModelConfigPickle)) or reset or resume: with open(currentModelConfigPickle, "wb") as handle: pickle.dump(parameters, handle, protocol=pickle.HIGHEST_PROTOCOL) @@ -170,6 +174,10 @@ def TrainingManager_split( ) parameters = pickle.load(open(currentModelConfigPickle, "rb")) + if (not os.path.exists(currentModelConfigYaml)) or reset or resume: + with open(currentModelConfigYaml, "w") as handle: + yaml.dump(parameters, handle, default_flow_style=False) + training_loop( training_data=dataframe_train, validation_data=dataframe_validation, diff --git a/docs/usage.md b/docs/usage.md index 609b746b2..1f56947c9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -24,7 +24,7 @@ Please follow the [installation instructions](./setup.md#installation) to instal ### Anonymize Data -A major reason why one would want to anonymize data is to ensure that trained models do not inadvertently encode protected health information [[1](https://doi.org/10.1145/3436755),[2](https://doi.org/10.1038/s42256-020-0186-1)]. GaNDLF can anonymize one or multiple images using the `gandlf anonymizer` command as follows: +A major reason why one would want to anonymize data is to ensure that trained models do not inadvertently do not encode protect health information [[1](https://doi.org/10.1145/3436755),[2](https://doi.org/10.1038/s42256-020-0186-1)]. GaNDLF can anonymize single images or a collection of images using the `gandlf anonymizer` command. It can be used as follows: ```bash # continue from previous shell @@ -81,7 +81,7 @@ Once these files are present, the patch miner can be run using the following com ### Running preprocessing before training/inference (optional) -Running preprocessing before training/inference is optional, but recommended. It will significantly reduce the computational footprint during training/inference at the expense of larger storage requirements. Use the following command, which will save the processed data in `./experiment_0/output_dir/` with a new data CSV and the corresponding model configuration: +Running preprocessing before training/inference is optional, but recommended. It will significantly reduce the computational footprint during training/inference at the expense of larger storage requirements. To run preprocessing before training/inference you can use the following command, which will save the processed data in `./experiment_0/output_dir/` with a new data CSV and the corresponding model configuration: ```bash # continue from previous shell @@ -108,7 +108,7 @@ N,/full/path/N/0.nii.gz,/full/path/N/1.nii.gz,...,/full/path/N/X.nii.gz,/full/pa **Notes:** - `Channel` can be substituted with `Modality` or `Image` -- `Label` can be substituted with `Mask` or `Segmentation` and is used to specify the annotation file for segmentation models +- `Label` can be substituted with `Mask` or `Segmentation`and is used to specify the annotation file for segmentation models - For classification/regression, add a column called `ValueToPredict`. Currently, we are supporting only a single value prediction per model. - Only a single `Label` or `ValueToPredict` header should be passed - Multiple segmentation classes should be in a single file with unique label numbers. 
diff --git a/docs/usage.md b/docs/usage.md
index 609b746b2..1f56947c9 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -24,7 +24,7 @@ Please follow the [installation instructions](./setup.md#installation) to instal
 
 ### Anonymize Data
 
-A major reason why one would want to anonymize data is to ensure that trained models do not inadvertently encode protected health information [[1](https://doi.org/10.1145/3436755),[2](https://doi.org/10.1038/s42256-020-0186-1)]. GaNDLF can anonymize one or multiple images using the `gandlf anonymizer` command as follows:
+A major reason why one would want to anonymize data is to ensure that trained models do not inadvertently encode protected health information [[1](https://doi.org/10.1145/3436755),[2](https://doi.org/10.1038/s42256-020-0186-1)]. GaNDLF can anonymize single images or a collection of images using the `gandlf anonymizer` command. It can be used as follows:
 
 ```bash
 # continue from previous shell
@@ -81,7 +81,7 @@ Once these files are present, the patch miner can be run using the following com
 
 ### Running preprocessing before training/inference (optional)
 
-Running preprocessing before training/inference is optional, but recommended. It will significantly reduce the computational footprint during training/inference at the expense of larger storage requirements. Use the following command, which will save the processed data in `./experiment_0/output_dir/` with a new data CSV and the corresponding model configuration:
+Running preprocessing before training/inference is optional, but recommended. It will significantly reduce the computational footprint during training/inference at the expense of larger storage requirements. To run preprocessing before training/inference, use the following command, which will save the processed data in `./experiment_0/output_dir/` with a new data CSV and the corresponding model configuration:
 
 ```bash
 # continue from previous shell
@@ -108,7 +108,7 @@ N,/full/path/N/0.nii.gz,/full/path/N/1.nii.gz,...,/full/path/N/X.nii.gz,/full/pa
 **Notes:**
 
 - `Channel` can be substituted with `Modality` or `Image`
-- `Label` can be substituted with `Mask` or `Segmentation` and is used to specify the annotation file for segmentation models
+- `Label` can be substituted with `Mask` or `Segmentation` and is used to specify the annotation file for segmentation models
 - For classification/regression, add a column called `ValueToPredict`. Currently, we are supporting only a single value prediction per model.
 - Only a single `Label` or `ValueToPredict` header should be passed
 - Multiple segmentation classes should be in a single file with unique label numbers.
@@ -152,14 +152,14 @@ The following command shows how the script works:
 (venv_gandlf) $> gandlf construct-csv \
   # -h, --help         Show help message and exit
   -i $DATA_DIRECTORY # this is the main data directory
-  -c _t1.nii.gz,_t1ce.nii.gz,_t2.nii.gz,_flair.nii.gz \ # an example image identifier for 4 structural brain MR sequences for BraTS, and can be changed based on your data. In the simplest case of a single modality, a ".nii.gz" will suffice
+  -c _t1.nii.gz,_t1ce.nii.gz,_t2.nii.gz,_flair.nii.gz \ # an example image identifier for 4 structural brain MR sequences for BraTS, and can be changed based on your data
   -l _seg.nii.gz \ # an example label identifier - not needed for regression/classification, and can be changed based on your data
   -o ./experiment_0/train_data.csv # output CSV to be used for training
 ```
 
 **Notes**:
 
-- For classification/regression, add a column called `ValueToPredict`. Currently, we support only a single value prediction per model.
+- For classification/regression, add a column called `ValueToPredict`. Currently, we support only a single value prediction per model.
 - `SubjectID` or `PatientName` is used to ensure that the randomized split is done per-subject rather than per-image.
 - For data arrangement different to what is described above, a customized script will need to be written to generate the CSV, or you can enter the data manually into the CSV.
 
@@ -179,15 +179,13 @@ To split the data CSV into training, validation, and testing CSVs, the `gandlf s
 
 ## Customize the Training
 
-Adapting GaNDLF to your needs boils down to modifying a YAML-based configuration file which controls the parameters of training and inference. Below is a list of available samples for users to start as their baseline for further customization:
+GaNDLF requires a YAML-based configuration that controls various aspects of the training/inference process. Several samples are available for users to start from as a baseline for further customization:
 
+- [Sample showing all the available options](https://github.com/mlcommons/GaNDLF/blob/master/samples/config_all_options.yaml)
 - [Segmentation example](https://github.com/mlcommons/GaNDLF/blob/master/samples/config_segmentation_brats.yaml)
 - [Regression example](https://github.com/mlcommons/GaNDLF/blob/master/samples/config_regression.yaml)
 - [Classification example](https://github.com/mlcommons/GaNDLF/blob/master/samples/config_classification.yaml)
 
-To find **all the parameters** a GaNDLF config may modify, consult the following file:
-- [All available options](https://github.com/mlcommons/GaNDLF/blob/master/samples/config_all_options.yaml)
-
 **Notes**:
 
 - More details on the configuration options are available in the [customization page](customize.md).
@@ -512,3 +510,111 @@ This can be replicated for ROCm for AMD , by following the [instructions to set
 
 GaNDLF, and GaNDLF-created models, may be distributed as an [MLCube](https://mlcommons.github.io/mlcube/). This involves distributing an `mlcube.yaml` file. That file can be specified when using the [MLCube runners](https://mlcommons.github.io/mlcube/runners/). The runner will perform many aspects of configuring your container for you. Currently, only the `mlcube_docker` runner is supported. See the [MLCube documentation](https://mlcommons.github.io/mlcube/) for more details.
+
+## HuggingFace CLI
+
+This tool allows you to interact with the Hugging Face Hub directly from a terminal. For example, you can create a repository, upload and download files, etc.
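These subcommands delegate to the functions in `GANDLF/cli/huggingface_hub_handler.py` shown earlier in this diff, which wrap `huggingface_hub`. Roughly, a download resolves to a `snapshot_download` call like this (repo id and paths are illustrative):

```python
from huggingface_hub import snapshot_download

# Approximate equivalent of `gandlf hf --download ...`; all values illustrative.
snapshot_download(
    repo_id="distilbert-base-uncased",
    revision="v1.1",              # optional: branch name, tag, or commit hash
    cache_dir="./path/to/cache",  # optional: custom cache location
    local_dir="./path/to/dir",    # optional: materialize outside the cache
)
```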
+
+### Download an entire repository
+GaNDLF's Hugging Face CLI allows you to download repositories through the command line. This can be done by just specifying the repo id:
+
+```bash
+(main) $> gandlf hf --download --repo-id HuggingFaceH4/zephyr-7b-beta
+```
+
+Apart from the repo id, you can also provide other arguments.
+
+### Revision
+To download from a specific revision (commit hash, branch name, or tag), use the `--revision` option:
+```bash
+(main) $> gandlf hf --download --repo-id distilbert-base-uncased --revision v1.1
+```
+### Specify a token
+To access private or gated repositories, you must use a token. You can do this using the `--token` option:
+
+```bash
+(main) $> gandlf hf --download --repo-id distilbert-base-uncased --revision v1.1 --token hf_****
+```
+
+### Specify cache directory
+If not using `--local-dir`, all files will be downloaded by default to the cache directory defined by the `HF_HOME` environment variable. You can specify a custom cache using `--cache-dir`:
+
+```bash
+(main) $> gandlf hf --download --repo-id distilbert-base-uncased --revision v1.1 --token hf_**** --cache-dir ./path/to/cache
+```
+
+### Download to a local folder
+The recommended (and default) way to download files from the Hub is to use the cache-system. However, in some cases you want to download files and move them to a specific folder. This is useful to get a workflow closer to what git commands offer. You can do that using the `--local-dir` option.
+
+A `./huggingface/` folder is created at the root of your local directory, containing metadata about the downloaded files. This prevents re-downloading files if they're already up-to-date. If the metadata has changed, then the new file version is downloaded. This makes the local directory optimized for pulling only the latest changes.
+
+```bash
+(main) $> gandlf hf --download --repo-id distilbert-base-uncased --revision v1.1 --token hf_**** --cache-dir ./path/to/cache --local-dir ./path/to/dir
+```
+### Force Download
+Use `--force-download` to download files even if they already exist in the local cache:
+
+```bash
+(main) $> gandlf hf --download --repo-id distilbert-base-uncased --revision v1.1 --token hf_**** --cache-dir ./path/to/cache --local-dir ./path/to/dir --force-download
+```
+
+### Upload an entire folder
+Use the `gandlf hf --upload` command to upload files to the Hub directly:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model --token hf_****
+```
+
+### Upload to a Specific Path in Repo
+Use `--path-in-repo` to set the relative path of the directory in the repo. It defaults to the root folder of the repository:
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model/data --path-in-repo ./data --token hf_****
+```
+
+### Upload multiple files
+To upload several files from a folder at once without uploading the entire folder, use the `--allow-patterns` and `--ignore-patterns` options. They can also be combined with the `--delete-patterns` option to delete files on the repo while uploading new ones. In the example below, we sync the local Space by deleting remote files and uploading all files except the ones in `/logs`:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/space-example --folder-path . --repo-type space --ignore-patterns "/logs/*" --delete-patterns "*" --commit-message "Sync local Space with Hub"
+```
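On the upload side, the CLI ultimately calls `HfApi.upload_folder` (see `push_to_model_hub` earlier in this diff); a minimal sketch of the pattern-based sync above, assuming a valid token:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_****")  # a valid user token is assumed

# Approximate equivalent of the pattern-based sync shown above.
api.upload_folder(
    repo_id="Wauplin/space-example",  # illustrative repo id
    folder_path=".",
    repo_type="space",
    ignore_patterns=["/logs/*"],      # keep local logs out of the upload
    delete_patterns=["*"],            # delete remote files while committing
    commit_message="Sync local Space with Hub",
)
```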
+### Specify a token
+To upload files, you must use a token. By default, the locally saved token will be used. If you want to authenticate explicitly, use the `--token` option:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model --token hf_****
+```
+
+### Specify a commit message
+Use `--commit-message` and `--commit-description` to set a custom message and description for your commit instead of the default one:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model --token hf_**** --commit-message "Epoch 34/50" --commit-description "Val accuracy: 68%. Check tensorboard for more details."
+```
+
+### Upload to a dataset or Space
+To upload to a dataset or a Space, use the `--repo-type` option:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model --token hf_**** --repo-type dataset
+```
+
+### Huggingface Template For Upload
+#### Design and Modify Template
+To design the Hugging Face model card template, edit the `hugging_face.md` file and change each mandatory field from `[REQUIRED_FOR_GANDLF]` to its respective value. Do not leave these fields blank; otherwise, validation will raise an error. All other fields can be modified as needed.
+
+```bash
+# Here the required field is changed from [REQUIRED_FOR_GANDLF] to [GANDLF]
+**Developed by:** {{ developers | default("[GANDLF]", true)}}
+```
+#### Specify the Huggingface Template
+To specify the Hugging Face template, use the `--hf-template` option:
+
+```bash
+(main) $> gandlf hf --upload --repo-id Wauplin/my-cool-model --folder-path ./model --token hf_**** --hf-template ./hugging_face.md
+```
diff --git a/setup.py b/setup.py
index 07f747476..564df3045 100644
--- a/setup.py
+++ b/setup.py
@@ -51,7 +51,7 @@
     "scikit-image>=0.19.1",
     "setuptools",
     "seaborn",
-    "pyyaml",
+    "pyyaml==6.0.1",
     "tiffslide",
     "matplotlib",
     "gdown==5.1.0",
@@ -82,6 +82,7 @@
     "typer==0.9.0",
     "colorlog",
     "opacus==1.5.2",
+    "huggingface-hub==0.25.1",
 ]
 
 if __name__ == "__main__":
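The new entrypoint test below follows the repo's existing pattern of mocking the function that backs the CLI. Conceptually, it reduces to something like this sketch using `unittest.mock` directly (the repo's `run_test_case` helper wraps this kind of check; the invocation below is illustrative):

```python
from unittest import mock

from click.testing import CliRunner

from GANDLF.entrypoints.hf_hub_integration import new_way

# Patch the real download so the CLI can be exercised without network access.
with mock.patch(
    "GANDLF.entrypoints.hf_hub_integration.download_from_hub"
) as fake_download:
    result = CliRunner().invoke(
        new_way, ["--download", "--repo-id", "distilbert-base-uncased"]
    )
    assert result.exit_code == 0
    # new_way passes its arguments positionally, in this order.
    fake_download.assert_called_once_with(
        "distilbert-base-uncased", None, None, None, False, None
    )
```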
diff --git a/testing/entrypoints/test_hf_cli.py b/testing/entrypoints/test_hf_cli.py
new file mode 100644
index 000000000..d28351022
--- /dev/null
+++ b/testing/entrypoints/test_hf_cli.py
@@ -0,0 +1,122 @@
+import os.path
+import pytest
+from click.testing import CliRunner
+
+from GANDLF.entrypoints.hf_hub_integration import new_way
+
+from . import CliCase, run_test_case, TmpDire
+
+# This points to the function where the real logic is executed.
+# For tests, we replace it with a mock and check that it is called
+# with the proper args for different CLI commands.
+MOCK_PATH = "GANDLF.entrypoints.hf_hub_integration.download_from_hub"
+
+# These files are either created temporarily for test execution,
+# or we ensure they do not exist.
+test_file_system = [TmpDire("./tmp_dir")]
+
+
+test_cases = [
+    CliCase(
+        should_succeed=True,
+        new_way_lines=[
+            # full command
+            "--download --repo-id distilbert-base-uncased",
+            # tests short arg aliases
+            "-d -rid distilbert-base-uncased",
+        ],
+        expected_args={
+            "repo_id": "distilbert-base-uncased",
+            "revision": None,
+            "cache_dir": None,
+            "local_dir": None,
+            "force_download": False,
+            "token": None,
+        },
+    ),
+    CliCase(
+        should_succeed=True,
+        new_way_lines=[
+            # full command
+            "--download --repo-id distilbert-base-uncased --revision 6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411",
+            # tests short arg aliases
+            "-d -rid distilbert-base-uncased -rv 6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411",
+        ],
+        expected_args={
+            "repo_id": "distilbert-base-uncased",
+            "revision": "6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411",
+            "cache_dir": None,
+            "local_dir": None,
+            "force_download": False,
+            "token": None,
+        },
+    ),
+    CliCase(
+        should_succeed=True,
+        new_way_lines=[
+            # full command
+            "--download --repo-id distilbert-base-uncased --local-dir tmp_dir",
+            # tests short arg aliases
+            "-d -rid distilbert-base-uncased -ldir tmp_dir",
+        ],
+        expected_args={
+            "repo_id": "distilbert-base-uncased",
+            "revision": None,
+            "cache_dir": None,
+            "local_dir": os.path.normpath("tmp_dir"),
+            "force_download": False,
+            "token": None,
+        },
+    ),
+    CliCase(
+        should_succeed=False,
+        new_way_lines=[
+            # full command
+            "--repo-id distilbert-base-uncased ",
+            # tests short arg aliases
+            "-rid distilbert-base-uncased -ldir",
+        ],
+        expected_args={
+            "repo_id": "distilbert-base-uncased",
+            "revision": None,
+            "cache_dir": None,
+            "local_dir": None,
+            "force_download": False,
+            "token": None,
+        },
+    ),
+    CliCase(
+        should_succeed=False,
+        new_way_lines=[
+            # full command
+            "--download --repo-id distilbert-base-uncased --local-dir",
+            # tests short arg aliases
+            "-d -rid distilbert-base-uncased -ldir",
+        ],
+        expected_args={
+            "repo_id": "distilbert-base-uncased",
+            "revision": None,
+            "cache_dir": None,
+            "local_dir": os.path.normpath("tmp_dir"),
+            "force_download": False,
+            "token": None,
+        },
+    ),
+]
+
+
+@pytest.mark.parametrize("case", test_cases)
+def test_case(cli_runner: CliRunner, case: CliCase):
+    """Check the value of test_file_system before passing it to run_test_case,
+    falling back to an empty list if it is unset."""
+    file_system_config_ = test_file_system if test_file_system is not None else []
+
+    run_test_case(
+        cli_runner=cli_runner,
+        file_system_config=file_system_config_,  # default to an empty list if unset
+        case=case,
+        real_code_function_path=MOCK_PATH,
+        new_way=new_way,
+        old_way=None,
+        old_script_name=None,
+    )
diff --git a/testing/hugging_face.md b/testing/hugging_face.md
new file mode 100644
index 000000000..782a42746
--- /dev/null
+++ b/testing/hugging_face.md
@@ -0,0 +1,203 @@
+---
+# For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/modelcard.md?plain=1
+# Doc / guide: https://huggingface.co/docs/hub/model-cards
+# Any fields required by GaNDLF are marked with REQUIRED_FOR_GANDLF, and will be checked
+{{ card_data }}
+---
+
+# Model Card for {{ model_id |
default("Model ID", true) }} + + + +{{ model_summary | default("", true) }} + +## Model Details + +### Model Description + + + +{{ model_description | default("", true) }} + +- **Developed by:** {{ developers | default("[GANDLF]", true)}} +- **License:** {{ license | default("[GANDLF]", true)}} +- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}} +- **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}} +- **Model type:** {{ model_type | default("[More Information Needed]", true)}} +- **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}} +- **Finetuned from model [optional]:** {{ base_model | default("[More Information Needed]", true)}} +- **Primary Organization:** {{ primary_organization | default("[GANDLF]", true)}} +- **Commercial use policy:** {{ commercial_use | default("[GANDLF]", true)}} + +### Model Sources [optional] + + + +- **Repository:** {{ repo | default("[https://github.com/mlcommons/GaNDLF]", true)}} +- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}} +- **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}} + +## Uses + + + +### Direct Use + + + +{{ direct_use | default("[More Information Needed]", true)}} + +### Downstream Use [optional] + + + +{{ downstream_use | default("[More Information Needed]", true)}} + +### Out-of-Scope Use + + + +{{ out_of_scope_use | default("[More Information Needed]", true)}} + +## Bias, Risks, and Limitations + + + +{{ bias_risks_limitations | default("[More Information Needed]", true)}} + +### Recommendations + + + +{{ bias_recommendations | default("Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.", true)}} + +## How to Get Started with the Model + +Use the code below to get started with the model. + +{{ get_started_code | default("[More Information Needed]", true)}} + +## Training Details + +### Training Data + + + +{{ training_data | default("[More Information Needed]", true)}} + +### Training Procedure + + + +#### Preprocessing [optional] + +{{ preprocessing | default("[More Information Needed]", true)}} + + +#### Training Hyperparameters + +- **Training regime:** {{ training_regime | default("[More Information Needed]", true)}} + +#### Speeds, Sizes, Times [optional] + + + +{{ speeds_sizes_times | default("[More Information Needed]", true)}} + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +{{ testing_data | default("[More Information Needed]", true)}} + +#### Factors + + + +{{ testing_factors | default("[More Information Needed]", true)}} + +#### Metrics + + + +{{ testing_metrics | default("[More Information Needed]", true)}} + +### Results + +{{ results | default("[More Information Needed]", true)}} + +#### Summary + +{{ results_summary | default("", true) }} + +## Model Examination [optional] + + + +{{ model_examination | default("[More Information Needed]", true)}} + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** {{ hardware_type | default("[More Information Needed]", true)}} +- **Hours used:** {{ hours_used | default("[More Information Needed]", true)}} +- **Cloud Provider:** {{ cloud_provider | default("[More Information Needed]", true)}} +- **Compute Region:** {{ cloud_region | default("[More Information Needed]", true)}} +- **Carbon Emitted:** {{ co2_emitted | default("[More Information Needed]", true)}} + +## Technical Specifications [optional] + +### Model Architecture and Objective + +{{ model_specs | default("[More Information Needed]", true)}} + +### Compute Infrastructure + +{{ compute_infrastructure | default("[More Information Needed]", true)}} + +#### Hardware + +{{ hardware_requirements | default("[More Information Needed]", true)}} + +#### Software + +{{ software | default("[More Information Needed]", true)}} + +## Citation [optional] + + + +**BibTeX:** + +{{ citation_bibtex | default("[More Information Needed]", true)}} + +**APA:** + +{{ citation_apa | default("[More Information Needed]", true)}} + +## Glossary [optional] + + + +{{ glossary | default("[More Information Needed]", true)}} + +## More Information [optional] + +{{ more_information | default("[More Information Needed]", true)}} + +## Model Card Authors [optional] + +{{ model_card_authors | default("[More Information Needed]", true)}} + +## Model Card Contact + +{{ model_card_contact | default("[More Information Needed]", true)}} diff --git a/testing/test_full.py b/testing/test_full.py index f78928ef5..50b628e76 100644 --- a/testing/test_full.py +++ b/testing/test_full.py @@ -35,6 +35,7 @@ generate_metrics_dict, split_data_and_save_csvs, ) +from GANDLF.cli.huggingface_hub_handler import push_to_model_hub, download_from_hub from GANDLF.schedulers import global_schedulers_dict from GANDLF.optimizers import global_optimizer_dict from GANDLF.models import global_models_dict @@ -46,6 +47,7 @@ ) from GANDLF.anonymize import run_anonymizer from GANDLF.entrypoints.debug_info import _debug_info +from huggingface_hub import HfApi device = "cpu" @@ -3181,8 +3183,102 @@ def test_generic_data_split(): print("passed") +def test_upload_download_huggingface(device): + print("52: Starting huggingface upload download tests") + # overwrite previous results + sanitize_outputDir() + output_dir_patches = os.path.join(outputDir, "histo_patches") + if os.path.isdir(output_dir_patches): + shutil.rmtree(output_dir_patches) + Path(output_dir_patches).mkdir(parents=True, exist_ok=True) + output_dir_patches_output = os.path.join(output_dir_patches, "histo_patches_output") + Path(output_dir_patches_output).mkdir(parents=True, exist_ok=True) + + parameters_patch = {} + # extracting minimal number of patches to ensure that the test does not take too long + parameters_patch["num_patches"] = 10 + parameters_patch["read_type"] = "sequential" + # define patches to be extracted in terms of microns + parameters_patch["patch_size"] = ["1000m", "1000m"] + + file_config_temp = write_temp_config_path(parameters_patch) + + patch_extraction( + inputDir + "/train_2d_histo_segmentation.csv", + output_dir_patches_output, + file_config_temp, + ) + + file_for_Training = os.path.join(output_dir_patches_output, "opm_train.csv") + # read and parse csv + parameters = ConfigManager( + testingDir + "/config_segmentation.yaml", version_check_flag=False + ) + training_data, parameters["headers"] = parseTrainingCSV(file_for_Training) + parameters["patch_size"] = patch_size["2D"] + parameters["modality"] = "histo" + parameters["model"]["dimension"] = 2 + 
parameters["model"]["class_list"] = [0, 255]
+    parameters["model"]["amp"] = True
+    parameters["model"]["num_channels"] = 3
+    parameters = populate_header_in_parameters(parameters, parameters["headers"])
+    parameters["model"]["architecture"] = "resunet"
+    parameters["nested_training"]["testing"] = 1
+    parameters["nested_training"]["validation"] = -2
+    parameters["metrics"] = ["dice"]
+    parameters["model"]["onnx_export"] = True
+    parameters["model"]["print_summary"] = True
+    parameters["data_preprocessing"]["resize_image"] = [128, 128]
+    modelDir = os.path.join(outputDir, "modelDir")
+    Path(modelDir).mkdir(parents=True, exist_ok=True)
+    TrainingManager(
+        dataframe=training_data,
+        outputDir=modelDir,
+        parameters=parameters,
+        device=device,
+        resume=False,
+        reset=True,
+    )
+    inference_data, parameters["headers"] = parseTrainingCSV(
+        inputDir + "/train_2d_histo_segmentation.csv", train=False
+    )
+
+    inference_data.drop(index=inference_data.index[-1], axis=0, inplace=True)
+    InferenceManager(
+        dataframe=inference_data,
+        modelDir=modelDir,
+        parameters=parameters,
+        device=device,
+    )
+
+    # Initialize the Hugging Face API instance; the token is read from the
+    # environment instead of being hardcoded in the test suite
+    hf_token = os.environ.get("HF_TOKEN")
+    api = HfApi(token=hf_token)
+    try:
+        api.create_repo(repo_id="Ritesh43/ndlf_model")
+    except Exception as e:
+        print(e)
+    # Upload the model to the Hugging Face Hub
+    push_to_model_hub(
+        repo_id="Ritesh43/ndlf_model",
+        folder_path=modelDir,
+        hf_template=testingDir + "/hugging_face.md",
+        token=hf_token,
+    )
+    # Download the model from the Hugging Face Hub
+    download_from_hub(repo_id="Ritesh43/ndlf_model", local_dir=modelDir)
+
+    api.delete_repo(repo_id="Ritesh43/ndlf_model")
+
+    sanitize_outputDir()
+    print("passed")
+
+
 def test_generic_logging(capsys):
-    print("52: Starting test for logging")
+    print("53: Starting test for logging")
     log_file = "testing/gandlf.log"
     logger_setup(log_file)
     message = "Testing logging"
@@ -3230,7 +3326,7 @@
 
 
 def test_generic_debug_info():
-    print("53: Starting test for logging")
+    print("54: Starting test for debug info")
     _debug_info(True)
     print("passed")
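Since the round-trip test above now reads its credential from the environment rather than a hardcoded string (an assumption introduced in this edit, using the conventional `HF_TOKEN` variable), a guard along these lines could skip it gracefully when no token is configured:

```python
import os

import pytest

# Inside the test body: skip the Hub round-trip when no token is configured.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    pytest.skip("HF_TOKEN is not set; skipping Hugging Face Hub round-trip test")

# With a token exported, the test can be run selectively, e.g.:
#   pytest testing/test_full.py -k test_upload_download_huggingface
```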