Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add metrics for unique and active contributor counts #182

Merged
merged 3 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 41 additions & 1 deletion src/almanack/metrics/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pathlib
import shutil
import tempfile
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple

import pygit2
Expand All @@ -25,6 +25,7 @@
)

METRICS_TABLE = f"{pathlib.Path(__file__).parent!s}/metrics.yml"
DATETIME_NOW = datetime.now(timezone.utc)


def get_table(repo_path: str) -> List[Dict[str, Any]]:
Expand Down Expand Up @@ -276,6 +277,37 @@ def includes_common_docs(repo: pygit2.Repository) -> bool:
return False


def count_unique_contributors(
    repo: pygit2.Repository, since: Optional[datetime] = None
) -> int:
    """
    Counts the number of unique contributors to a repository.

    Contributors are identified by the author email recorded on each
    commit reachable from the repository's HEAD. If a `since` datetime
    is provided, only commits whose commit time is strictly after that
    datetime are considered. Otherwise, every reachable commit is
    considered, regardless of its timestamp.

    Args:
        repo (pygit2.Repository):
            The repository to analyze.
        since (Optional[datetime]):
            The cutoff datetime. Only contributions after
            this datetime are counted. If None, all
            contributions are considered. The value is converted
            with `datetime.timestamp()`, so a naive datetime is
            interpreted as local time; pass a timezone-aware
            datetime to avoid ambiguity.

    Returns:
        int:
            The number of unique contributors. Returns 0 when the
            repository has no commits yet.
    """
    # A repository without commits has an unborn HEAD; resolving
    # `repo.head` would raise, so report zero contributors instead.
    if repo.head_is_unborn:
        return 0

    # Keep "no cutoff" distinct from a cutoff at the epoch so that commits
    # with a zero or negative timestamp are still counted when `since`
    # is not given.
    since_timestamp = since.timestamp() if since is not None else None

    contributors = {
        commit.author.email
        for commit in repo.walk(repo.head.target, pygit2.GIT_SORT_TIME)
        if since_timestamp is None or commit.commit_time > since_timestamp
    }
    return len(contributors)


def compute_repo_data(repo_path: str) -> None:
"""
Computes comprehensive data for a GitHub repository.
Expand Down Expand Up @@ -333,6 +365,7 @@ def compute_repo_data(repo_path: str) -> None:
+ 1
),
"repo-commits-per-day": commits_count / days_of_development,
"almanack-table-datetime": DATETIME_NOW.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
"repo-includes-readme": file_exists_in_repo(
repo=repo,
expected_file_name="readme",
Expand All @@ -352,6 +385,13 @@ def compute_repo_data(repo_path: str) -> None:
"repo-is-citable": is_citable(repo=repo),
"repo-default-branch-not-master": default_branch_is_not_master(repo=repo),
"repo-includes-common-docs": includes_common_docs(repo=repo),
"repo-unique-contributors": count_unique_contributors(repo=repo),
"repo-unique-contributors-past-year": count_unique_contributors(
repo=repo, since=DATETIME_NOW - timedelta(days=365)
),
"repo-unique-contributors-past-182-days": count_unique_contributors(
repo=repo, since=DATETIME_NOW - timedelta(days=182)
),
"repo-agg-info-entropy": normalized_total_entropy,
"repo-file-info-entropy": file_entropy,
}
Expand Down
27 changes: 27 additions & 0 deletions src/almanack/metrics/metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ metrics:
description: >-
Floating point number which represents the number of commits
per day (using days of development).
- name: "almanack-table-datetime"
id: "SGA-META-0007"
result-type: "string"
description: >-
String representing the UTC date and time when this table was
generated in the format of '%Y-%m-%dT%H:%M:%S.%fZ', for example:
2024-11-22T18:20:30.123456Z .
- name: "repo-includes-readme"
id: "SGA-GL-0001"
result-type: "bool"
Expand Down Expand Up @@ -75,6 +82,26 @@ metrics:
Boolean value indicating whether the repo includes
common documentation directory and files associated
with building docsites.
- name: "repo-unique-contributors"
id: "SGA-GL-0008"
result-type: "int"
description: >-
Count of unique contributors since the beginning
of the repository.
- name: "repo-unique-contributors-past-year"
id: "SGA-GL-0009"
result-type: "int"
description: >-
Count of unique contributors within the last year
from now (where now is a reference to table value
of almanack-table-datetime).
- name: "repo-unique-contributors-past-182-days"
id: "SGA-GL-0010"
result-type: "int"
description: >-
Count of unique contributors within the last 182 days
from now (where now is a reference to table value
of almanack-table-datetime).
- name: "repo-agg-info-entropy"
id: "SGA-VS-0001"
result-type: "float"
Expand Down
28 changes: 19 additions & 9 deletions tests/data/almanack/repo_setup/create_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def repo_setup(
branch_name: str = "main",
) -> pygit2.Repository:
"""
Set up a temporary repository with specified files and commit dates.
Set up a temporary repository with specified files and commit metadata.

Args:
repo_path (Path):
Expand All @@ -172,7 +172,9 @@ def repo_setup(
Each dictionary must have:
- "files": A dictionary of filenames as keys and file content as values.
- "commit-date" (optional): The datetime of the commit.
If "commit-date" is not provided, the current date is used.
- "author" (optional): A dictionary with "name" and "email" keys
to specify the commit author. If not provided, defaults to the
repository's default user configuration.

branch_name (str):
The name of the branch to use for commits. Defaults to "main".
Expand All @@ -192,9 +194,10 @@ def repo_setup(

# Loop through each commit dictionary in `files`
for i, commit_data in enumerate(files):
# Extract commit files and commit date
# Extract commit files and metadata
commit_files = commit_data.get("files", {})
commit_date = commit_data.get("commit-date", datetime.now())
author_data = commit_data.get("author", None)

# Create or update each file in the current commit
for filename, content in commit_files.items():
Expand All @@ -209,12 +212,19 @@ def repo_setup(
index.add_all()
index.write()

# Set the author and committer signatures with the specific commit date
author = pygit2.Signature(
repo.default_signature.name,
repo.default_signature.email,
int(commit_date.timestamp()),
)
# Determine the author and committer
if author_data:
author = pygit2.Signature(
author_data["name"],
author_data["email"],
int(commit_date.timestamp()),
)
else:
author = pygit2.Signature(
repo.default_signature.name,
repo.default_signature.email,
int(commit_date.timestamp()),
)
committer = author # Assuming the committer is the same as the author

# Write the index to a tree
Expand Down
86 changes: 85 additions & 1 deletion tests/metrics/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

import pathlib
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from typing import Dict, List

import jsonschema
Expand All @@ -15,6 +15,7 @@
from almanack.metrics.data import (
METRICS_TABLE,
compute_repo_data,
count_unique_contributors,
default_branch_is_not_master,
file_exists_in_repo,
get_table,
Expand Down Expand Up @@ -506,3 +507,86 @@ def test_includes_common_docs(tmp_path, files, expected_result):

# Assert that the function returns the expected result
assert includes_common_docs(repo) == expected_result


@pytest.mark.parametrize(
    "files, since, expected_count",
    [
        # Test case 1: All contributors since the beginning
        (
            [
                {
                    "files": {"file1.txt": "Hello, world!"},
                    "author": {"name": "Alice", "email": "alice@example.com"},
                },
                {
                    "files": {"file2.txt": "Another commit"},
                    "author": {"name": "Bob", "email": "bob@example.com"},
                },
                {
                    "files": {"file3.txt": "Yet another commit"},
                    "author": {"name": "Alice", "email": "alice@example.com"},
                },
            ],
            None,  # since: All contributors
            2,  # Alice and Bob
        ),
        # Test case 2: Contributors in the past 1 year
        (
            [
                {
                    "files": {"file1.txt": "Recent commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=200),
                    "author": {"name": "Alice", "email": "alice@example.com"},
                },
                {
                    "files": {"file2.txt": "Old commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=400),
                    "author": {"name": "Bob", "email": "bob@example.com"},
                },
                {
                    "files": {"file3.txt": "Another recent commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=100),
                    "author": {"name": "Charlie", "email": "charlie@example.com"},
                },
            ],
            datetime.now(timezone.utc) - timedelta(days=365),  # since: 1 year ago
            2,  # Alice and Charlie
        ),
        # Test case 3: Contributors in the past 182 days
        (
            [
                {
                    "files": {"file1.txt": "Recent commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=150),
                    "author": {"name": "Alice", "email": "alice@example.com"},
                },
                {
                    "files": {"file2.txt": "Older commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=400),
                    "author": {"name": "Bob", "email": "bob@example.com"},
                },
                {
                    "files": {"file3.txt": "Another recent commit"},
                    "commit-date": datetime.now(timezone.utc) - timedelta(days=50),
                    "author": {"name": "Charlie", "email": "charlie@example.com"},
                },
            ],
            datetime.now(timezone.utc) - timedelta(days=182),  # since: 182 days ago
            2,  # Alice and Charlie
        ),
    ],
)
def test_count_unique_contributors(tmp_path, files, since, expected_count):
    """
    Test the count_unique_contributors function with various time frames and contributors.

    Contributors are de-duplicated by author email, so every distinct
    author in the parametrized cases uses a distinct email address and
    repeated authors reuse the same one.
    """
    # Set up the repository
    repo_path = tmp_path / "test_repo"
    repo = repo_setup(repo_path, files)

    # Test the count_unique_contributors function
    result = count_unique_contributors(repo, since)

    # Assert the result matches the expected count
    assert result == expected_count, f"Expected {expected_count}, got {result}"