diff --git a/pyproject.toml b/pyproject.toml index dad23507..b739c7e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,6 @@ root = "." [tool.ruff] target-version = "py311" fix = true - lint.select = [ # mccabe "C90", diff --git a/src/almanack/git.py b/src/almanack/git.py index 72c916dc..3c39f7b6 100644 --- a/src/almanack/git.py +++ b/src/almanack/git.py @@ -4,7 +4,7 @@ import pathlib import tempfile -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import pygit2 from charset_normalizer import from_bytes @@ -42,9 +42,8 @@ def get_commits(repo: pygit2.Repository) -> List[pygit2.Commit]: # Get the latest commit (HEAD) from the repository head = repo.revparse_single("HEAD") # Create a walker to iterate over commits starting from the HEAD - walker = repo.walk( - head.id, pygit2.enums.SortMode.NONE - ) # SortMode.NONE traverses commits in natural order; no sorting applied. + # sorting by time. + walker = repo.walk(head.id, pygit2.GIT_SORT_TIME) # Collect all commits from the walker into a list commits = list(walker) return commits @@ -147,14 +146,6 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]: return str(source_commit.id), str(target_commit.id) -""" -Module for handling various tasks with git repo blobs. -""" - - -import pygit2 - - def detect_encoding(blob_data: bytes) -> str: """ Detect the encoding of the given blob data using charset-normalizer. @@ -227,6 +218,34 @@ def find_file( return found_entry +def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int: + """ + Counts all files (Blobs) within a Git tree, including files + in subdirectories. + + This function recursively traverses the provided `tree` + object to count each file, represented as a `pygit2.Blob`, + within the tree and any nested subdirectories. + + Args: + tree (Union[pygit2.Tree, pygit2.Blob]): + The Git tree object (of type `pygit2.Tree`) + to traverse and count files. 
The initial call + should be made with the root tree of a commit. + + Returns: + int: + The total count of files (Blobs) within the tree, + including nested files in subdirectories. + """ + if isinstance(tree, pygit2.Blob): + # Directly return 1 if the input is a Blob + return 1 + elif isinstance(tree, pygit2.Tree): + # Recursively count files for Tree + return sum(count_files(entry) for entry in tree) + + def read_file( repo: pygit2.Repository, entry: Optional[pygit2.Object] = None, diff --git a/src/almanack/metrics/data.py b/src/almanack/metrics/data.py index f62bf332..0986546a 100644 --- a/src/almanack/metrics/data.py +++ b/src/almanack/metrics/data.py @@ -11,7 +11,14 @@ import pygit2 import yaml -from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file +from ..git import ( + clone_repository, + count_files, + find_file, + get_commits, + get_edited_files, + read_file, +) from .entropy.calculate_entropy import ( calculate_aggregate_entropy, calculate_normalized_entropy, @@ -195,6 +202,41 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool: return repo.head.shorthand != "master" +def days_of_development(repo: pygit2.Repository) -> float: + """ + Count the calendar days spanned by the repository's commit history. + + Args: + repo (pygit2.Repository): The git repository to inspect. + + Returns: + float: The number of days from the earliest to the latest commit date, inclusive (0 for a repository with no commits). + """ + try: + # Try to get the HEAD commit. If it raises an error, there are no commits. + repo.revparse_single("HEAD") + except KeyError: + # If HEAD doesn't exist (repo is empty), return 0 days.
+ return 0 + + # Traverse the commit history and collect commit dates + commit_dates = [ + datetime.fromtimestamp(commit.commit_time).date() + for commit in repo.walk(repo.head.target, pygit2.GIT_SORT_TIME) + ] + + # If no commits, return 0 + if not commit_dates: + return 0 + + # Calculate the number of days between the first and last commit + # +1 to include the first day + total_days = (max(commit_dates) - min(commit_dates)).days + 1 + + # Return the total number of days of development + return total_days + + def includes_common_docs(repo: pygit2.Repository) -> bool: """ Check whether the repo includes common documentation files and directories @@ -244,74 +286,75 @@ def compute_repo_data(repo_path: str) -> None: Returns: dict: A dictionary containing data key-pairs. """ - try: - # Convert repo_path to an absolute path and initialize the repository - repo_path = pathlib.Path(repo_path).resolve() - repo = pygit2.Repository(str(repo_path)) - - # Retrieve the list of commits from the repository - commits = get_commits(repo) - most_recent_commit = commits[0] - first_commit = commits[-1] - - # Get a list of files that have been edited between the first and most recent commit - file_names = get_edited_files(repo, first_commit, most_recent_commit) - - # Calculate the normalized total entropy for the repository - normalized_total_entropy = calculate_aggregate_entropy( - repo_path, - str(first_commit.id), - str(most_recent_commit.id), - file_names, - ) - - # Calculate the normalized entropy for the changes between the first and most recent commits - file_entropy = calculate_normalized_entropy( - repo_path, - str(first_commit.id), - str(most_recent_commit.id), - file_names, - ) - # Convert commit times to UTC datetime objects, then format as date strings.
- first_commit_date, most_recent_commit_date = ( - datetime.fromtimestamp(commit.commit_time, tz=timezone.utc) - .date() - .isoformat() - for commit in (first_commit, most_recent_commit) - ) - - # Return the data structure - return { - "repo-path": str(repo_path), - "repo-commits": len(commits), - "repo-file-count": len(file_names), - "repo-commit-time-range": (first_commit_date, most_recent_commit_date), - "repo-includes-readme": file_exists_in_repo( - repo=repo, - expected_file_name="readme", - ), - "repo-includes-contributing": file_exists_in_repo( - repo=repo, - expected_file_name="contributing", - ), - "repo-includes-code-of-conduct": file_exists_in_repo( - repo=repo, - expected_file_name="code_of_conduct", - ), - "repo-includes-license": file_exists_in_repo( - repo=repo, - expected_file_name="license", - ), - "repo-is-citable": is_citable(repo=repo), - "repo-default-branch-not-master": default_branch_is_not_master(repo=repo), - "repo-includes-common-docs": includes_common_docs(repo=repo), - "repo-agg-info-entropy": normalized_total_entropy, - "repo-file-info-entropy": file_entropy, - } - - except Exception as e: - # If processing fails, return an error dictionary - return {"repo_path": str(repo_path), "error": str(e)} + # Convert repo_path to an absolute path and initialize the repository + repo_path = pathlib.Path(repo_path).resolve() + repo = pygit2.Repository(str(repo_path)) + + # Retrieve the list of commits from the repository + commits = get_commits(repo) + most_recent_commit = commits[0] + first_commit = commits[-1] + + # Get a list of files that have been edited between the first and most recent commit + edited_file_names = get_edited_files(repo, first_commit, most_recent_commit) + + # Calculate the normalized total entropy for the repository + normalized_total_entropy = calculate_aggregate_entropy( + repo_path, + str(first_commit.id), + str(most_recent_commit.id), + edited_file_names, + ) + + # Calculate the normalized entropy for the changes between 
the first and most recent commits + file_entropy = calculate_normalized_entropy( + repo_path, + str(first_commit.id), + str(most_recent_commit.id), + edited_file_names, + ) + # Convert commit times to local-time date objects; ISO date strings are produced in the returned dict below. + first_commit_date, most_recent_commit_date = ( + datetime.fromtimestamp(commit.commit_time).date() + for commit in (first_commit, most_recent_commit) + ) + + # Return the data structure + return { + "repo-path": str(repo_path), + "repo-commits": (commits_count := len(commits)), + "repo-file-count": count_files(tree=most_recent_commit.tree), + "repo-commit-time-range": ( + first_commit_date.isoformat(), + most_recent_commit_date.isoformat(), + ), + "repo-days-of-development": ( + days_of_development := (most_recent_commit_date - first_commit_date).days + + 1 + ), + "repo-commits-per-day": commits_count / days_of_development, + "repo-includes-readme": file_exists_in_repo( + repo=repo, + expected_file_name="readme", + ), + "repo-includes-contributing": file_exists_in_repo( + repo=repo, + expected_file_name="contributing", + ), + "repo-includes-code-of-conduct": file_exists_in_repo( + repo=repo, + expected_file_name="code_of_conduct", + ), + "repo-includes-license": file_exists_in_repo( + repo=repo, + expected_file_name="license", + ), + "repo-is-citable": is_citable(repo=repo), + "repo-default-branch-not-master": default_branch_is_not_master(repo=repo), + "repo-includes-common-docs": includes_common_docs(repo=repo), + "repo-agg-info-entropy": normalized_total_entropy, + "repo-file-info-entropy": file_entropy, + } def compute_pr_data(repo_path: str, pr_branch: str, main_branch: str) -> Dict[str, Any]: diff --git a/src/almanack/metrics/metrics.yml b/src/almanack/metrics/metrics.yml index 51e54d40..c070040b 100644 --- a/src/almanack/metrics/metrics.yml +++ b/src/almanack/metrics/metrics.yml @@ -20,6 +20,18 @@ metrics: result-type: "tuple" description: >- Starting commit and most recent commit for the repository.
+ - name: "repo-days-of-development" + id: "SGA-META-0005" + result-type: "int" + description: >- + Integer representing the number of days of development + between most recent commit and first commit. + - name: "repo-commits-per-day" + id: "SGA-META-0006" + result-type: "float" + description: >- + Floating point number which represents the number of commits + per day (using days of development). - name: "repo-includes-readme" id: "SGA-GL-0001" result-type: "bool" diff --git a/tests/conftest.py b/tests/conftest.py index 5989f2d2..666fe808 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -118,12 +118,16 @@ def community_health_repository_path(tmp_path_factory): yield repo_setup( repo_path=pathlib.Path(temp_dir), - files={ - "README.md": "# This is an example readme\n\nWelcome to our repo!", - "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", - "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", - "LICENSE.txt": "This is an example LICENSE file.", - }, + files=[ + { + "files": { + "README.md": "# This is an example readme\n\nWelcome to our repo!", + "CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md", + "CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md", + "LICENSE.txt": "This is an example LICENSE file.", + } + } + ], ) diff --git a/tests/data/almanack/repo_setup/create_repo.py b/tests/data/almanack/repo_setup/create_repo.py index ae7845b9..93dee53e 100644 --- a/tests/data/almanack/repo_setup/create_repo.py +++ b/tests/data/almanack/repo_setup/create_repo.py @@ -4,6 +4,7 @@ """ import pathlib +from datetime import datetime import pygit2 @@ -156,54 +157,93 @@ def create_entropy_repositories(base_path: pathlib.Path) -> None: def repo_setup( - repo_path: pathlib.Path, files: dict, branch_name: str = "main" + repo_path: pathlib.Path, + files: list[dict], + branch_name: str = "main", ) -> pygit2.Repository: """ - Set up a temporary repository with specified files. 
+ Set up a temporary repository with specified files and commit dates. + Args: repo_path (Path): - The directory where the repo will be created. - files (dict): - A dictionary where keys are filenames and values are their content. + The temporary directory where the repo will be created. + files (list[dict]): + A list of dictionaries where each dictionary represents a commit. + Each dictionary must have: + - "files": A dictionary of filenames as keys and file content as values. + - "commit-date" (optional): The datetime of the commit. + If "commit-date" is not provided, the current date is used. + branch_name (str): - A string with the name of the branch which will be used for - committing changes. Defaults to "main". + The name of the branch to use for commits. Defaults to "main". + Returns: - pygit2.Repository: The initialized repository with files. + pygit2.Repository: + The initialized repository with the specified commits. """ - # Create a new repository in the specified path + # Initialize the repository repo = pygit2.init_repository(repo_path, bare=False) # Set user.name and user.email in the config set_repo_user_config(repo) - # Create nested files in the repository - for file_path, content in files.items(): - full_path = repo_path / file_path # Construct full path - full_path.parent.mkdir( - parents=True, exist_ok=True - ) # Create any parent directories - full_path.write_text(content) # Write the file content - - # Stage and commit the files - index = repo.index - index.add_all() - index.write() - - author = repo.default_signature - tree = repo.index.write_tree() - - # Commit the files - repo.create_commit( - f"refs/heads/{branch_name}", - author, - author, - "Initial commit with setup files", - tree, - [], - ) - - # Set the HEAD to point to the new branch - repo.set_head(f"refs/heads/{branch_name}") + branch_ref = f"refs/heads/{branch_name}" + parent_commit = None + + # Loop through each commit dictionary in `files` + for i, commit_data in 
enumerate(files): + # Extract commit files and commit date + commit_files = commit_data.get("files", {}) + commit_date = commit_data.get("commit-date", datetime.now()) + + # Create or update each file in the current commit + for filename, content in commit_files.items(): + file_path = repo_path / filename + file_path.parent.mkdir( + parents=True, exist_ok=True + ) # Ensure parent directories exist + file_path.write_text(content) + + # Stage all changes in the index + index = repo.index + index.add_all() + index.write() + + # Set the author and committer signatures with the specific commit date + author = pygit2.Signature( + repo.default_signature.name, + repo.default_signature.email, + int(commit_date.timestamp()), + ) + committer = author # Assuming the committer is the same as the author + + # Write the index to a tree + tree = index.write_tree() + + # Create the commit + commit_message = f"Commit #{i + 1} with files: {', '.join(commit_files.keys())}" + commit_id = repo.create_commit( + ( + branch_ref if i == 0 else None + ), # Set branch reference only for the first commit + author, + committer, + commit_message, + tree, + ( + [parent_commit.id] if parent_commit else [] + ), # Use the .id attribute to get the commit ID + ) + + # Update the parent_commit to the latest commit for chaining + parent_commit = repo.get( + commit_id + ) # Explicitly get the Commit object by its ID + + # Set the HEAD to the main branch after all commits + repo.set_head(branch_ref) + + # Ensure the HEAD is pointing to the last commit + repo.head.set_target(parent_commit.id) return repo diff --git a/tests/metrics/test_community_health.py b/tests/metrics/test_community_health.py deleted file mode 100644 index 928b94df..00000000 --- a/tests/metrics/test_community_health.py +++ /dev/null @@ -1,3 +0,0 @@ -""" -Tests various community health metric functionality. 
-""" diff --git a/tests/metrics/test_data.py b/tests/metrics/test_data.py index 57e69011..73738a0e 100644 --- a/tests/metrics/test_data.py +++ b/tests/metrics/test_data.py @@ -3,7 +3,8 @@ """ import pathlib -from typing import List +from datetime import datetime, timedelta +from typing import Dict, List import jsonschema import pandas as pd @@ -22,6 +23,8 @@ ) from tests.data.almanack.repo_setup.create_repo import repo_setup +DATETIME_NOW = datetime.now() + def test_generate_repo_data(entropy_repository_paths: dict[str, pathlib.Path]) -> None: """ @@ -167,42 +170,44 @@ def test_file_exists_in_repo( "files, expected", [ # Test with CITATION.cff - ({"CITATION.cff": "CITATION content."}, True), + ({"files": {"CITATION.cff": "CITATION content."}}, True), # Test with CITATION.bib - ({"CITATION.bib": "CITATION content."}, True), + ({"files": {"CITATION.bib": "CITATION content."}}, True), # Test citation sections in markdown format ( - {"readme.md": "## Citation\nThis is a citation."}, + {"files": {"readme.md": "## Citation\nThis is a citation."}}, True, ), ( - {"readme.md": "## Citing us\n\nHere's our awesome citation."}, + {"files": {"readme.md": "## Citing us\n\nHere's our awesome citation."}}, True, ), # RST scenarios - ({"README.md": "Citation\n--------"}, True), - ({"README.md": "Citing\n------"}, True), - ({"README.md": "Cite\n----"}, True), - ({"README.md": "How to cite\n-----------"}, True), + ({"files": {"README.md": "Citation\n--------"}}, True), + ({"files": {"README.md": "Citing\n------"}}, True), + ({"files": {"README.md": "Cite\n----"}}, True), + ({"files": {"README.md": "How to cite\n-----------"}}, True), # DOI badge ( { - "README.md": ( - "# Awesome project\n\n" - "[![DOI](https://img.shields.io/badge/DOI-10.48550/arXiv.2311.13417-blue)]" - "(https://doi.org/10.48550/arXiv.2311.13417)" - ), + "files": { + "README.md": ( + "# Awesome project\n\n" + "[![DOI](https://img.shields.io/badge/DOI-10.48550/arXiv.2311.13417-blue)]" + 
"(https://doi.org/10.48550/arXiv.2311.13417)" + ), + } }, True, ), - ({"README.md": "## How to cite"}, True), + ({"files": {"README.md": "## How to cite"}}, True), # Test with README without citation ( - {"readme.md": "This is a readme."}, + {"files": {"readme.md": "This is a readme."}}, False, ), # Test with no citation files - ({"random.txt": "Some random text."}, False), + ({"files": {"random.txt": "Some random text."}}, False), # test the almanack itseft as a special case (None, True), ], @@ -213,7 +218,7 @@ def test_is_citable(tmp_path, files, expected): """ if files is not None: - repo = repo_setup(repo_path=tmp_path, files=files) + repo = repo_setup(repo_path=tmp_path, files=[files]) else: # test the almanack itself repo_path = pathlib.Path(".").resolve() @@ -237,21 +242,27 @@ def test_default_branch_is_not_master(tmp_path): # test with a master branch repo = repo_setup( - repo_path=example1, files={"example.txt": "example"}, branch_name="master" + repo_path=example1, + files=[{"files": {"example.txt": "example"}}], + branch_name="master", ) assert not default_branch_is_not_master(repo) # test with a main branch repo = repo_setup( - repo_path=example2, files={"example.txt": "example"}, branch_name="main" + repo_path=example2, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) assert default_branch_is_not_master(repo) # test with a simulated remote head pointed at remote master repo = repo_setup( - repo_path=example3, files={"example.txt": "example"}, branch_name="main" + repo_path=example3, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) # simulate having a remote head pointed at a branch named master @@ -270,7 +281,9 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main repo = repo_setup( - repo_path=example4, files={"example.txt": "example"}, branch_name="main" + repo_path=example4, + files=[{"files": {"example.txt": "example"}}], + branch_name="main", ) 
# simulate having a remote head pointed at a branch named master @@ -289,7 +302,9 @@ def test_default_branch_is_not_master(tmp_path): # test with a simulated remote head pointed at remote main but with local branch master repo = repo_setup( - repo_path=example5, files={"example.txt": "example"}, branch_name="master" + repo_path=example5, + files=[{"files": {"example.txt": "example"}}], + branch_name="master", ) # simulate having a remote head pointed at a branch named master @@ -303,60 +318,174 @@ def test_default_branch_is_not_master(tmp_path): assert not default_branch_is_not_master(repo) +@pytest.mark.parametrize( + "files, expected_commits, expected_file_count, expected_days, expected_commits_per_day", + [ + # Single commit on a single day with one file + ([{"files": {"file1.txt": "content"}}], 1, 1, 1, 1.0), + # Two commits on the same day with two files + ( + [{"files": {"file1.txt": "content"}}, {"files": {"file2.txt": "content"}}], + 2, + 2, + 1, + 2.0, + ), + # Multiple commits over multiple days + ( + [ + { + "commit-date": DATETIME_NOW - timedelta(days=2), + "files": {"file1.txt": "content"}, + }, + { + "commit-date": DATETIME_NOW - timedelta(days=1), + "files": {"file2.txt": "content"}, + }, + {"commit-date": DATETIME_NOW, "files": {"file3.txt": "content"}}, + ], + 3, + 3, + 3, + 1.0, + ), + # Multiple commits on the same day with multiple files + ( + [ + {"commit-date": DATETIME_NOW, "files": {"file1.txt": "content"}}, + {"commit-date": DATETIME_NOW, "files": {"file2.txt": "new content"}}, + { + "commit-date": DATETIME_NOW, + "files": {"file3.txt": "another content"}, + }, + ], + 3, + 3, + 1, + 3.0, + ), + ], +) +# add noqa rule below to avoid warnings about too many parameters +def test_commit_frequency_data( # noqa: PLR0913 + tmp_path: pathlib.Path, + files: List[Dict[str, str]], + expected_commits: int, + expected_file_count: int, + expected_days: int, + expected_commits_per_day: float, +): + """ + Tests to ensure metric keys surrounding commits and 
commit frequency are + working as expected. + """ + # Setup the repository with the provided file structure and dates + repo_setup(repo_path=tmp_path, files=files) + + # Run the function to compute repo data + repo_data = compute_repo_data(str(tmp_path)) + + # Assertions for repo-commits + assert ( + repo_data["repo-commits"] == expected_commits + ), f"Expected {expected_commits} commits, got {repo_data['repo-commits']}" + + # Assertions for repo-file-count + assert ( + repo_data["repo-file-count"] == expected_file_count + ), f"Expected {expected_file_count} files, got {repo_data['repo-file-count']}" + + # Assertions for repo-commit-time-range + if "commit-date" in files[0].keys(): + first_date = files[0]["commit-date"].date().isoformat() + last_date = files[-1]["commit-date"].date().isoformat() + else: + today = DATETIME_NOW.date().isoformat() + first_date = last_date = today + assert repo_data["repo-commit-time-range"] == ( + first_date, + last_date, + ), f"Expected commit time range ({first_date}, {last_date}), got {repo_data['repo-commit-time-range']}" + + # Assertions for repo-days-of-development + assert ( + repo_data["repo-days-of-development"] == expected_days + ), f"Expected {expected_days} days of development, got {repo_data['repo-days-of-development']}" + + # Assertions for repo-commits-per-day + assert ( + repo_data["repo-commits-per-day"] == expected_commits_per_day + ), f"Expected {expected_commits_per_day} commits per day, got {repo_data['repo-commits-per-day']}" + + @pytest.mark.parametrize( "files, expected_result", [ # Scenario 1: `docs` directory with common documentation files ( { - "docs/mkdocs.yml": "site_name: Test Docs", - "docs/index.md": "# Welcome to the documentation", - "README.md": "# Project Overview", + "files": { + "docs/mkdocs.yml": "site_name: Test Docs", + "docs/index.md": "# Welcome to the documentation", + "README.md": "# Project Overview", + } }, True, ), # Scenario 2: `docs` directory without common documentation files ( { - 
"docs/random_file.txt": "This is just a random file", - "README.md": "# Project Overview", + "files": { + "docs/random_file.txt": "This is just a random file", + "README.md": "# Project Overview", + } }, False, ), # Scenario 3: No `docs` directory ( { - "README.md": "# Project Overview", - "src/main.py": "# Main script", + "files": { + "README.md": "# Project Overview", + "src/main.py": "# Main script", + } }, False, ), # Scenario 4: `docs` directory with misleading names ( { - "docs/mkdoc.yml": "Not a valid mkdocs file", - "docs/INDEX.md": "# Not a documentation index", + "files": { + "docs/mkdoc.yml": "Not a valid mkdocs file", + "docs/INDEX.md": "# Not a documentation index", + } }, False, ), # Scenario 5: `docs` directory with sphinx-like structure ( { - "docs/source/index.rst": "An rst index", + "files": { + "docs/source/index.rst": "An rst index", + } }, True, ), # Scenario 6: `docs` directory with sphinx-like structure ( { - "docs/source/index.md": "An md index", + "files": { + "docs/source/index.md": "An md index", + } }, True, ), # Scenario 6: `docs` directory with a readme under source dir ( { - "docs/source/readme.md": "A readme for nested docs", + "files": { + "docs/source/readme.md": "A readme for nested docs", + } }, True, ), @@ -369,7 +498,7 @@ def test_includes_common_docs(tmp_path, files, expected_result): Tests includes_common_docs """ if files is not None: - repo = repo_setup(repo_path=tmp_path, files=files) + repo = repo_setup(repo_path=tmp_path, files=[files]) else: # test the almanack itself repo_path = pathlib.Path(".").resolve() diff --git a/tests/test_cli.py b/tests/test_cli.py index cf06992f..41404607 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -28,7 +28,7 @@ def test_cli_almanack(tmp_path): """ # create a repo with a single file and commit - repo = repo_setup(repo_path=tmp_path, files={"example.txt": "example"}) + repo = repo_setup(repo_path=tmp_path, files=[{"files": {"example.txt": "example"}}]) # gather output and return 
code from running a CLI command stdout, _, returncode = run_cli_command(command=["almanack", repo.path]) diff --git a/tests/test_git.py b/tests/test_git.py index adb6fd61..24321062 100644 --- a/tests/test_git.py +++ b/tests/test_git.py @@ -3,13 +3,14 @@ """ import pathlib -from typing import Any +from typing import Any, Dict, List import pygit2 import pytest from almanack.git import ( clone_repository, + count_files, detect_encoding, find_file, get_commits, @@ -18,6 +19,7 @@ get_most_recent_commits, read_file, ) +from tests.data.almanack.repo_setup.create_repo import repo_setup def test_clone_repository(entropy_repository_paths: dict[str, Any]): @@ -163,3 +165,65 @@ def test_find_file_and_read_file( assert read_file_result_filepath == expected_content assert read_file_result_filepath == read_file_result_pygit_obj + + +@pytest.mark.parametrize( + "files, expected_count", + [ + # Test case: Single file at root + ([{"files": {"file1.txt": "content"}}], 1), + # Test case: Multiple files at root + ([{"files": {"file1.txt": "content", "file2.txt": "content"}}], 2), + # Test case: Files in nested directories + ( + [ + { + "files": { + "dir1/file1.txt": "content", + "dir1/dir2/file2.txt": "content", + } + } + ], + 2, + ), + # Test case: Empty repository (no files) + ([{"files": {}}], 0), + # Test case: Mixed root and nested files + ( + [ + { + "files": { + "file1.txt": "content", + "dir1/file2.txt": "content", + "dir1/dir2/file3.txt": "content", + } + } + ], + 3, + ), + ], +) +def test_count_files( + files: List[Dict[str, str]], expected_count: int, tmp_path: pathlib.Path +): + """ + Test the count_files function on various repository structures. + + Args: + files (List[Dict[str, str]]): A list of dictionaries where each dictionary represents a commit + and contains filenames as keys and file content as values. + expected_count (int): The expected number of files in the most recent commit tree. 
+ tmp_path (pathlib.Path): Temporary directory path provided by pytest for testing. + """ + # Set up the test repository + repo_path = tmp_path / "test_repo" + repo = repo_setup(repo_path, files=files) + + # Get the most recent commit and its tree + most_recent_commit = next(repo.walk(repo.head.target, pygit2.GIT_SORT_TIME)) + most_recent_tree = most_recent_commit.tree + + # Run the count_files function and assert the file count matches the expected count + assert ( + count_files(most_recent_tree) == expected_count + ), f"Expected {expected_count} files, got {count_files(most_recent_tree)}"