Skip to content

Commit

Permalink
Add and update commit frequency metrics (#173)
Browse files Browse the repository at this point in the history
* add and update commit frequency metrics

* docs formatting

* correct the counting of files

* fix cli test

* update count_files

Co-Authored-By: Faisal Alquaddoomi <[email protected]>

* include commit date unified w/ files in test repos

Co-Authored-By: Faisal Alquaddoomi <[email protected]>

* fix docs test

* linting

---------

Co-authored-by: Faisal Alquaddoomi <[email protected]>
  • Loading branch information
d33bs and falquaddoomi authored Nov 21, 2024
1 parent c6cc135 commit c06376b
Show file tree
Hide file tree
Showing 10 changed files with 473 additions and 166 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ root = "."
[tool.ruff]
target-version = "py311"
fix = true

lint.select = [
# mccabe
"C90",
Expand Down
43 changes: 31 additions & 12 deletions src/almanack/git.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pathlib
import tempfile
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

import pygit2
from charset_normalizer import from_bytes
Expand Down Expand Up @@ -42,9 +42,8 @@ def get_commits(repo: pygit2.Repository) -> List[pygit2.Commit]:
# Get the latest commit (HEAD) from the repository
head = repo.revparse_single("HEAD")
# Create a walker to iterate over commits starting from the HEAD
walker = repo.walk(
head.id, pygit2.enums.SortMode.NONE
) # SortMode.NONE traverses commits in natural order; no sorting applied.
# sorting by time.
walker = repo.walk(head.id, pygit2.GIT_SORT_TIME)
# Collect all commits from the walker into a list
commits = list(walker)
return commits
Expand Down Expand Up @@ -147,14 +146,6 @@ def get_most_recent_commits(repo_path: pathlib.Path) -> tuple[str, str]:
return str(source_commit.id), str(target_commit.id)


"""
Module for handling various tasks with git repo blobs.
"""


import pygit2


def detect_encoding(blob_data: bytes) -> str:
"""
Detect the encoding of the given blob data using charset-normalizer.
Expand Down Expand Up @@ -227,6 +218,34 @@ def find_file(
return found_entry


def count_files(tree: Union[pygit2.Tree, pygit2.Blob]) -> int:
    """
    Counts all files (Blobs) within a Git tree, including files
    in subdirectories.

    This function recursively traverses the provided `tree`
    object to count each file, represented as a `pygit2.Blob`,
    within the tree and any nested subdirectories.

    Args:
        tree (Union[pygit2.Tree, pygit2.Blob]):
            The Git tree object (of type `pygit2.Tree`)
            to traverse and count files. The initial call
            should be made with the root tree of a commit.
            A `pygit2.Blob` is accepted so the function can
            recurse over tree entries.

    Returns:
        int:
            The total count of files (Blobs) within the tree,
            including nested files in subdirectories. Objects
            that are neither a Blob nor a Tree count as 0.
    """
    if isinstance(tree, pygit2.Blob):
        # A Blob is a single file.
        return 1
    if isinstance(tree, pygit2.Tree):
        # Recursively count files for every entry of the Tree.
        return sum(count_files(entry) for entry in tree)
    # Any other object is not a file. Returning 0 (instead of implicitly
    # returning None) keeps the recursive sum() above from raising TypeError.
    # NOTE(review): presumably this is reached for submodule (gitlink)
    # entries - confirm against pygit2's tree iteration behaviour.
    return 0


def read_file(
repo: pygit2.Repository,
entry: Optional[pygit2.Object] = None,
Expand Down
181 changes: 112 additions & 69 deletions src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
import pygit2
import yaml

from ..git import clone_repository, find_file, get_commits, get_edited_files, read_file
from ..git import (
clone_repository,
count_files,
find_file,
get_commits,
get_edited_files,
read_file,
)
from .entropy.calculate_entropy import (
calculate_aggregate_entropy,
calculate_normalized_entropy,
Expand Down Expand Up @@ -195,6 +202,41 @@ def default_branch_is_not_master(repo: pygit2.Repository) -> bool:
return repo.head.shorthand != "master"


def days_of_development(repo: pygit2.Repository) -> int:
    """
    Counts the calendar days spanned by the repository's commit history.

    Args:
        repo (pygit2.Repository):
            The repository whose commits, walked from HEAD, are measured.

    Returns:
        int:
            The number of days between the earliest and the latest
            commit date, counting both of those days (a history whose
            commits all fall on one date yields 1). Returns 0 when HEAD
            cannot be resolved or the walk yields no commits.
    """
    try:
        # Try to get the HEAD commit. If it raises an error, there are no commits.
        repo.revparse_single("HEAD")
    except KeyError:
        # If HEAD doesn't exist (repo is empty), there are no days of development.
        return 0

    # Collect the date of every commit reachable from HEAD. No sort mode is
    # requested because only the minimum and maximum dates are used below.
    # NOTE(review): fromtimestamp() without a tz converts using the local
    # timezone of the machine running this code - confirm that is intended.
    commit_dates = [
        datetime.fromtimestamp(commit.commit_time).date()
        for commit in repo.walk(repo.head.target)
    ]

    # If no commits, return 0
    if not commit_dates:
        return 0

    # Days between the first and last commit date, +1 to include the first day.
    return (max(commit_dates) - min(commit_dates)).days + 1


def includes_common_docs(repo: pygit2.Repository) -> bool:
"""
Check whether the repo includes common documentation files and directories
Expand Down Expand Up @@ -244,74 +286,75 @@ def compute_repo_data(repo_path: str) -> None:
Returns:
dict: A dictionary containing data key-value pairs.
"""
try:
# Convert repo_path to an absolute path and initialize the repository
repo_path = pathlib.Path(repo_path).resolve()
repo = pygit2.Repository(str(repo_path))

# Retrieve the list of commits from the repository
commits = get_commits(repo)
most_recent_commit = commits[0]
first_commit = commits[-1]

# Get a list of files that have been edited between the first and most recent commit
file_names = get_edited_files(repo, first_commit, most_recent_commit)

# Calculate the normalized total entropy for the repository
normalized_total_entropy = calculate_aggregate_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
file_names,
)

# Calculate the normalized entropy for the changes between the first and most recent commits
file_entropy = calculate_normalized_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
file_names,
)
# Convert commit times to UTC datetime objects, then format as date strings.
first_commit_date, most_recent_commit_date = (
datetime.fromtimestamp(commit.commit_time, tz=timezone.utc)
.date()
.isoformat()
for commit in (first_commit, most_recent_commit)
)

# Return the data structure
return {
"repo-path": str(repo_path),
"repo-commits": len(commits),
"repo-file-count": len(file_names),
"repo-commit-time-range": (first_commit_date, most_recent_commit_date),
"repo-includes-readme": file_exists_in_repo(
repo=repo,
expected_file_name="readme",
),
"repo-includes-contributing": file_exists_in_repo(
repo=repo,
expected_file_name="contributing",
),
"repo-includes-code-of-conduct": file_exists_in_repo(
repo=repo,
expected_file_name="code_of_conduct",
),
"repo-includes-license": file_exists_in_repo(
repo=repo,
expected_file_name="license",
),
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}

except Exception as e:
# If processing fails, return an error dictionary
return {"repo_path": str(repo_path), "error": str(e)}
# Convert repo_path to an absolute path and initialize the repository
repo_path = pathlib.Path(repo_path).resolve()
repo = pygit2.Repository(str(repo_path))

# Retrieve the list of commits from the repository
commits = get_commits(repo)
most_recent_commit = commits[0]
first_commit = commits[-1]

# Get a list of files that have been edited between the first and most recent commit
edited_file_names = get_edited_files(repo, first_commit, most_recent_commit)

# Calculate the normalized total entropy for the repository
normalized_total_entropy = calculate_aggregate_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
edited_file_names,
)

# Calculate the normalized entropy for the changes between the first and most recent commits
file_entropy = calculate_normalized_entropy(
repo_path,
str(first_commit.id),
str(most_recent_commit.id),
edited_file_names,
)
# Convert commit times to UTC datetime objects, then format as date strings.
first_commit_date, most_recent_commit_date = (
datetime.fromtimestamp(commit.commit_time).date()
for commit in (first_commit, most_recent_commit)
)

# Return the data structure
return {
"repo-path": str(repo_path),
"repo-commits": (commits_count := len(commits)),
"repo-file-count": count_files(tree=most_recent_commit.tree),
"repo-commit-time-range": (
first_commit_date.isoformat(),
most_recent_commit_date.isoformat(),
),
"repo-days-of-development": (
days_of_development := (most_recent_commit_date - first_commit_date).days
+ 1
),
"repo-commits-per-day": commits_count / days_of_development,
"repo-includes-readme": file_exists_in_repo(
repo=repo,
expected_file_name="readme",
),
"repo-includes-contributing": file_exists_in_repo(
repo=repo,
expected_file_name="contributing",
),
"repo-includes-code-of-conduct": file_exists_in_repo(
repo=repo,
expected_file_name="code_of_conduct",
),
"repo-includes-license": file_exists_in_repo(
repo=repo,
expected_file_name="license",
),
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}


def compute_pr_data(repo_path: str, pr_branch: str, main_branch: str) -> Dict[str, Any]:
Expand Down
12 changes: 12 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ metrics:
result-type: "tuple"
description: >-
Starting commit and most recent commit for the repository.
- name: "repo-days-of-development"
id: "SGA-META-0005"
result-type: "int"
description: >-
Integer representing the number of days of development
between the first commit and the most recent commit (inclusive of both days).
- name: "repo-commits-per-day"
id: "SGA-META-0006"
result-type: "float"
description: >-
Floating point number which represents the number of commits
per day (using days of development).
- name: "repo-includes-readme"
id: "SGA-GL-0001"
result-type: "bool"
Expand Down
16 changes: 10 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,16 @@ def community_health_repository_path(tmp_path_factory):

yield repo_setup(
repo_path=pathlib.Path(temp_dir),
files={
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
},
files=[
{
"files": {
"README.md": "# This is an example readme\n\nWelcome to our repo!",
"CONTRIBUTING.md": "# This is a stub for a CONTRIBUTING.md",
"CODE_OF_CONDUCT.md": "# This is a stub for a CODE_OF_CONDUCT.md",
"LICENSE.txt": "This is an example LICENSE file.",
}
}
],
)


Expand Down
Loading

0 comments on commit c06376b

Please sign in to comment.