Display analysis information #2134

Open · wants to merge 6 commits into master
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -10,6 +10,7 @@
- render maec/* fields #843 @s-ff
- replace Halo spinner with Rich #2086 @s-ff
- optimize rule matching #2080 @williballenthin
- display analysis information to users #857 @s-ff

### Breaking Changes

7 changes: 2 additions & 5 deletions capa/capabilities/common.py
@@ -37,10 +37,10 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi
file_features.update(function_features)

_, matches = ruleset.match(Scope.FILE, file_features, NO_ADDRESS)
return matches, len(file_features)
return matches, file_features


def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalone=True) -> bool:
def has_file_limitation(rules: RuleSet, capabilities: MatchResults) -> bool:
file_limitation_rules = list(filter(lambda r: r.is_file_limitation_rule(), rules.rules.values()))

for file_limitation_rule in file_limitation_rules:
@@ -51,9 +51,6 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon
for line in file_limitation_rule.meta.get("description", "").split("\n"):
logger.warning(" %s", line)
logger.warning(" Identified via rule: %s", file_limitation_rule.name)
if is_standalone:
logger.warning(" ")
logger.warning(" Use -v or -vv if you really want to see the capabilities identified by capa.")
logger.warning("-" * 80)

# bail on first file limitation
26 changes: 22 additions & 4 deletions capa/capabilities/static.py
@@ -21,6 +21,8 @@
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.helpers import redirecting_print_to_tqdm
from capa.features.file import Import
from capa.features.insn import API
from capa.capabilities.common import find_file_capabilities
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor

@@ -96,7 +98,7 @@ def find_basic_block_capabilities(

def find_code_capabilities(
ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle
) -> Tuple[MatchResults, MatchResults, MatchResults, int]:
) -> Tuple[MatchResults, MatchResults, MatchResults, FeatureSet]:
Collaborator:
changing the signature of a function is a breaking change, so this should wait until the next major release.
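One non-breaking alternative (a hypothetical sketch, not part of this PR) would keep the public count-returning signature and move the FeatureSet-returning logic into a new internal helper; the helper name below is illustrative only.

from typing import Tuple

from capa.rules import RuleSet
from capa.engine import MatchResults
from capa.features.extractors.base_extractor import FunctionHandle, StaticFeatureExtractor


def find_code_capabilities(
    ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle
) -> Tuple[MatchResults, MatchResults, MatchResults, int]:
    # keep the existing contract: callers still receive a feature count.
    # _find_code_capabilities_ex is a hypothetical helper returning the full FeatureSet.
    function_matches, bb_matches, insn_matches, function_features = _find_code_capabilities_ex(ruleset, extractor, fh)
    return function_matches, bb_matches, insn_matches, len(function_features)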

"""
find matches for the given rules within the given function.

@@ -129,7 +131,7 @@ def find_code_capabilities(
function_features[feature].add(va)

_, function_matches = ruleset.match(Scope.FUNCTION, function_features, fh.address)
return function_matches, bb_matches, insn_matches, len(function_features)
return function_matches, bb_matches, insn_matches, function_features


def find_static_capabilities(
@@ -141,6 +143,8 @@

feature_counts = rdoc.StaticFeatureCounts(file=0, functions=())
library_functions: Tuple[rdoc.LibraryFunction, ...] = ()
apicall_count: int = 0
import_count: int = 0

assert isinstance(extractor, StaticFeatureExtractor)
with redirecting_print_to_tqdm(disable_progress):
@@ -180,12 +184,18 @@ def pbar(s, *args, **kwargs):
pb.set_postfix_str(f"skipped {n_libs} library functions ({percentage}%)")
continue

function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities(
function_matches, bb_matches, insn_matches, function_features = find_code_capabilities(
ruleset, extractor, f
)
feature_count = len(function_features)
feature_counts.functions += (
rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count),
)
# cumulatively count the total number of API calls
for feature, vas in function_features.items():
if isinstance(feature, API):
apicall_count += len(vas)

t1 = time.time()

match_count = 0
@@ -223,9 +233,15 @@ def pbar(s, *args, **kwargs):
rule = ruleset[rule_name]
capa.engine.index_rule_matches(function_and_lower_features, rule, locations)

all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, function_and_lower_features)
all_file_matches, file_features = find_file_capabilities(ruleset, extractor, function_and_lower_features)
feature_count = len(file_features)
feature_counts.file = feature_count

# cumulatively count the total number of Import features
for feature, _ in file_features.items():
Collaborator:
use .keys() here to indicate that you won't use the value

if isinstance(feature, Import):
import_count += 1
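A minimal sketch of that suggestion (hypothetical wording; Import is already imported from capa.features.file in this module):

# iterate over the feature keys only; the mapped location values are not used here
for feature in file_features.keys():
    if isinstance(feature, Import):
        import_count += 1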

matches: MatchResults = dict(
itertools.chain(
# each rule exists in exactly one scope,
@@ -241,6 +257,8 @@ def pbar(s, *args, **kwargs):
meta = {
"feature_counts": feature_counts,
"library_functions": library_functions,
"apicall_count": apicall_count,
"import_count": import_count,
}

return matches, meta
2 changes: 1 addition & 1 deletion capa/ghidra/capa_explorer.py
@@ -240,7 +240,7 @@ def get_capabilities():

capabilities, counts = capa.capabilities.common.find_capabilities(rules, extractor, True)

if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=False):
if capa.capabilities.common.has_file_limitation(rules, capabilities):
popup("capa explorer encountered warnings during analysis. Please check the console output for more information.") # type: ignore [name-defined] # noqa: F821
logger.info("capa encountered warnings during analysis")

4 changes: 2 additions & 2 deletions capa/ghidra/capa_ghidra.py
@@ -80,7 +80,7 @@ def run_headless():
meta.analysis.library_functions = counts["library_functions"]
meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities)

if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=True):
if capa.capabilities.common.has_file_limitation(rules, capabilities):
logger.info("capa encountered warnings during analysis")

if args.json:
@@ -130,7 +130,7 @@ def run_ui():
meta.analysis.library_functions = counts["library_functions"]
meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities)

if capa.capabilities.common.has_file_limitation(rules, capabilities, is_standalone=False):
if capa.capabilities.common.has_file_limitation(rules, capabilities):
logger.info("capa encountered warnings during analysis")

if verbose == "vverbose":
2 changes: 2 additions & 0 deletions capa/ghidra/helpers.py
@@ -156,5 +156,7 @@ def collect_metadata(rules: List[Path]):
),
feature_counts=rdoc.StaticFeatureCounts(file=0, functions=()),
library_functions=(),
apicall_count=0,
import_count=0,
),
)
2 changes: 2 additions & 0 deletions capa/ida/helpers.py
@@ -170,6 +170,8 @@ def collect_metadata(rules: List[Path]):
# ignore these for now - not used by IDA plugin.
feature_counts=rdoc.StaticFeatureCounts(file=0, functions=()),
library_functions=(),
apicall_count=0,
import_count=0,
),
)

2 changes: 1 addition & 1 deletion capa/ida/plugin/form.py
@@ -811,7 +811,7 @@ def slot_progress_feature_extraction(text):

capa.ida.helpers.inform_user_ida_ui("capa encountered file type warnings during analysis")

if capa.capabilities.common.has_file_limitation(ruleset, capabilities, is_standalone=False):
if capa.capabilities.common.has_file_limitation(ruleset, capabilities):
capa.ida.helpers.inform_user_ida_ui("capa encountered file limitation warnings during analysis")
except Exception as e:
logger.exception("Failed to check for file limitations (error: %s)", e)
2 changes: 2 additions & 0 deletions capa/loader.py
@@ -350,6 +350,8 @@ def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts):
),
feature_counts=counts["feature_counts"],
library_functions=counts["library_functions"],
apicall_count=counts["apicall_count"],
import_count=counts["import_count"],
)
elif isinstance(extractor, DynamicFeatureExtractor):
return rdoc.DynamicAnalysis(
24 changes: 8 additions & 16 deletions capa/main.py
@@ -75,7 +75,7 @@
FORMAT_RESULT,
)
from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities
from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor
from capa.features.extractors.base_extractor import FeatureExtractor, DynamicFeatureExtractor

RULES_PATH_DEFAULT_STRING = "(embedded rules)"
SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)"
@@ -670,12 +670,6 @@ def find_file_limitations_from_cli(args, rules: RuleSet, file_extractors: List[F
# file limitations that rely on non-file scope won't be detected here.
# nor on FunctionName features, because pefile doesn't support this.
found_file_limitation = has_file_limitation(rules, pure_file_capabilities)
if found_file_limitation:
# bail if capa encountered file limitation e.g. a packed binary
# do show the output in verbose mode, though.
if not (args.verbose or args.vverbose or args.json):
logger.debug("file limitation short circuit, won't analyze fully.")
raise ShouldExitError(E_FILE_LIMITATION)
return found_file_limitation


@@ -804,7 +798,7 @@ def main(argv: Optional[List[str]] = None):
input_format = get_input_format_from_cli(args)
rules = get_rules_from_cli(args)
file_extractors = get_file_extractors_from_cli(args, input_format)
found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors)
_ = find_file_limitations_from_cli(args, rules, file_extractors)
except ShouldExitError as e:
return e.status_code

@@ -837,12 +831,6 @@ def main(argv: Optional[List[str]] = None):
meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts)
meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities)

if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation:
# bail if capa's static feature extractor encountered file limitation e.g. a packed binary
# do show the output in verbose mode, though.
if not (args.verbose or args.vverbose or args.json):
return E_FILE_LIMITATION

if args.json:
print(capa.render.json.render(meta, rules, capabilities))
elif args.vverbose:
@@ -890,8 +878,10 @@ def ida_main():

meta.analysis.feature_counts = counts["feature_counts"]
meta.analysis.library_functions = counts["library_functions"]
meta.analysis.apicall_count = counts["apicall_count"]
meta.analysis.import_count = counts["import_count"]

if has_file_limitation(rules, capabilities, is_standalone=False):
if has_file_limitation(rules, capabilities):
capa.ida.helpers.inform_user_ida_ui("capa encountered warnings during analysis")

colorama.init(strip=True)
@@ -928,8 +918,10 @@ def ghidra_main():

meta.analysis.feature_counts = counts["feature_counts"]
meta.analysis.library_functions = counts["library_functions"]
meta.analysis.apicall_count = counts["apicall_count"]
meta.analysis.import_count = counts["import_count"]

if has_file_limitation(rules, capabilities, is_standalone=False):
if has_file_limitation(rules, capabilities):
logger.info("capa encountered warnings during analysis")

print(capa.render.default.render(meta, rules, capabilities))
24 changes: 24 additions & 0 deletions capa/render/default.py
@@ -19,6 +19,9 @@

tabulate.PRESERVE_WHITESPACE = True

MIN_LIBFUNCS_RATIO = 0.4
MIN_API_CALLS = 10
Collaborator:
where did these numbers come from? and how should i interpret them?

Collaborator Author (@fariss, Jun 10, 2024):
  1. MIN_LIBFUNCS_RATIO: when library functions make up less than 40% of the functions in a sample, we inform users that capa might report false-positive matches from functions that would otherwise have been classified as library functions. I don't have any statistical data to back this up other than this hex-rays blogpost.
  2. MIN_API_CALLS: when a sample has very few API calls, it is a strong indication that it might be packed or encrypted, as regular programs tend to make far more than 10 calls (though we still need to run a benchmark across multiple samples to decide what a good number is here). For example, this packed capa-testfile emits 0 API features; luckily we detect that it is packed with UPX. If that weren't the case, this banner could serve as an indication that the sample might be packed.

Collaborator:
good explanations!

would you include the key parts here as a comment?
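A hypothetical sketch of how those key points could be captured as comments beside the constants (wording illustrative, not the final text):

# warn when FLIRT signatures recognize fewer than 40% of a sample's functions as
# library code: unidentified library functions may yield false-positive matches.
# heuristic threshold, not backed by statistics (see the hex-rays FLIRT blogpost).
MIN_LIBFUNCS_RATIO = 0.4

# warn when a sample emits very few API call features: regular programs make far
# more calls, so a low count hints the sample may be packed, encrypted, or tiny.
# heuristic threshold, pending a benchmark across many samples.
MIN_API_CALLS = 10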

Collaborator:
also i'm interested to see how frequently this message is shown to users. I don't think our sigs will identify 40% of functions in most binaries, so i'm a little concerned this message will be shown too often.

have you had a chance to collect these stats against a large number of samples?
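One way to gather such stats (a hypothetical sketch over capa's JSON result documents produced with --json; field names assumed from this PR's result-document layout):

import json
import sys
from pathlib import Path

MIN_LIBFUNCS_RATIO = 0.4

def would_show_banner(result_path: Path) -> bool:
    # read a capa result document and recompute the library-function ratio
    analysis = json.loads(result_path.read_text())["meta"]["analysis"]
    n_libs = len(analysis.get("library_functions", []))
    n_funcs = len(analysis["feature_counts"]["functions"])
    total = n_libs + n_funcs
    return (n_libs / total if total else 0.0) < MIN_LIBFUNCS_RATIO

if __name__ == "__main__":
    # usage: python banner_stats.py /path/to/json/results/
    flags = [would_show_banner(p) for p in Path(sys.argv[1]).glob("*.json")]
    print(f"banner would be shown for {sum(flags)} of {len(flags)} samples")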

Collaborator:
I think it's still helpful information since we know there's most likely more library code than we've identified.



def width(s: str, character_count: int) -> str:
"""pad the given string to at least `character_count`"""
@@ -29,6 +32,27 @@ def width(s: str, character_count: int) -> str:


def render_meta(doc: rd.ResultDocument, ostream: StringIO):
if isinstance(doc.meta.analysis, rd.StaticAnalysis):

if doc.meta.analysis.apicall_count < MIN_API_CALLS:
ostream.write(
rutils.bold(
"The analyzed sample reports very few API calls, this could indicate that it is packed, encrypted, corrupted, or tiny\n"
)
)

n_libs: int = len(doc.meta.analysis.library_functions)
n_funcs: int = len(doc.meta.analysis.feature_counts.functions)
lib_ratio: float = n_libs / (n_funcs + n_libs) if (n_funcs + n_libs) > 0 else 0

if lib_ratio < MIN_LIBFUNCS_RATIO:
ostream.write(
rutils.bold(
"Few library functions (%.2f%% of all functions) recognized by FLIRT signatures, results may contain false positives\n"
)
% (100 * lib_ratio)
)

rows = [
(width("md5", 22), width(doc.meta.sample.md5, 82)),
("sha1", doc.meta.sample.sha1),
4 changes: 4 additions & 0 deletions capa/render/proto/__init__.py
@@ -210,6 +210,8 @@ def static_analysis_to_pb2(analysis: rd.StaticAnalysis) -> capa_pb2.StaticAnalys
library_functions=[
capa_pb2.LibraryFunction(address=addr_to_pb2(lf.address), name=lf.name) for lf in analysis.library_functions
],
apicall_count=analysis.apicall_count,
import_count=analysis.import_count,
)


@@ -703,6 +705,8 @@ def static_analysis_from_pb2(analysis: capa_pb2.StaticAnalysis) -> rd.StaticAnal
library_functions=tuple(
[rd.LibraryFunction(address=addr_from_pb2(lf.address), name=lf.name) for lf in analysis.library_functions]
),
apicall_count=analysis.apicall_count,
import_count=analysis.import_count,
)


2 changes: 2 additions & 0 deletions capa/render/proto/capa.proto
@@ -403,6 +403,8 @@ message StaticAnalysis {
StaticLayout layout = 7;
StaticFeatureCounts feature_counts = 8;
repeated LibraryFunction library_functions = 9;
uint64 apicall_count = 10;
uint64 import_count = 11;
}

message StaticFeatureCounts {
321 changes: 166 additions & 155 deletions capa/render/proto/capa_pb2.py

Large diffs are not rendered by default.
