Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: start dotnet feature extraction #958

Merged
merged 25 commits into from
Apr 8, 2022
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
574a30d
adding instruction feature extractors
mike-hunhoff Apr 5, 2022
6947497
adding support to parse imports
mike-hunhoff Apr 5, 2022
e3c749d
move API name normalization to helper function
mike-hunhoff Apr 6, 2022
efd8b30
adding dnfile feature extractor
mike-hunhoff Apr 6, 2022
0499f9e
Merge branch 'dotnet-main' into dotnet-extract
mike-hunhoff Apr 6, 2022
656776f
dotnet feature extractor cleanup
mike-hunhoff Apr 6, 2022
f9f5b29
adding guard rails to #US stream reads
mike-hunhoff Apr 7, 2022
6cd5f27
update function names to get un/managed dotnet imports
mike-hunhoff Apr 7, 2022
ab8384a
use dnfile_ extractor for file format
mike-hunhoff Apr 7, 2022
a921b83
PR updateS
mike-hunhoff Apr 7, 2022
1af95e5
Merge branch 'dotnet-main' into dotnet-extract
mike-hunhoff Apr 7, 2022
45392c5
add debug message for MethodBodyFormat errors
mike-hunhoff Apr 7, 2022
0a5d99a
additional typing
mike-hunhoff Apr 7, 2022
6cfaccc
removing get_class_import_name
mike-hunhoff Apr 7, 2022
c2c54d3
reuse code from dnfile_ to extract file imports
mike-hunhoff Apr 8, 2022
30c599a
rename generate_dotnet_token to be more descriptive
mike-hunhoff Apr 8, 2022
48b5abd
renaming get_dotnet_methods to be more descriptive
mike-hunhoff Apr 8, 2022
6c499df
adding debug message for string decode errors
mike-hunhoff Apr 8, 2022
61366fc
updating logging messages
mike-hunhoff Apr 8, 2022
0d67a6a
Merge branch 'dotnet-main' into dotnet-extract
mike-hunhoff Apr 8, 2022
21a35da
rename dnfile_ to dotnetfile
mike-hunhoff Apr 8, 2022
7509bb7
adding tests
mike-hunhoff Apr 8, 2022
60214d2
updating file headers
mike-hunhoff Apr 8, 2022
2941050
adding mypy ignore
mike-hunhoff Apr 8, 2022
48b8110
fix typo in test file
mike-hunhoff Apr 8, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/mypy/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@ ignore_missing_imports = True

[mypy-elftools.*]
ignore_missing_imports = True

[mypy-dncil.*]
ignore_missing_imports = True
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved
30 changes: 22 additions & 8 deletions capa/features/extractors/dnfile_.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,40 @@
import logging
from typing import Tuple, Iterator
from itertools import chain

import dnfile
import pefile

import capa.features.extractors.helpers
from capa.features.file import Import
from capa.features.common import OS, OS_ANY, ARCH_ANY, ARCH_I386, ARCH_AMD64, FORMAT_DOTNET, Arch, Format, Feature
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.dotnet.helpers import get_dotnet_managed_imports, get_dotnet_unmanaged_imports

logger = logging.getLogger(__name__)


def extract_file_format(**kwargs):
def extract_file_format(**kwargs) -> Iterator[Tuple[Format, int]]:
    """yield the file format feature, which is always FORMAT_DOTNET at offset 0x0

    keyword arguments are accepted and ignored.
    """
    dotnet_format = Format(FORMAT_DOTNET)
    yield dotnet_format, 0x0


def extract_file_os(**kwargs):
def extract_file_import_names(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Import, int]]:
    """yield an Import feature and its token for every managed and unmanaged import of `pe`

    extra keyword arguments are accepted and ignored.
    """
    all_imports = chain(get_dotnet_managed_imports(pe), get_dotnet_unmanaged_imports(pe))
    for (token, name) in all_imports:
        is_managed = "::" in name
        if is_managed:
            # like System.IO.File::OpenRead
            yield Import(name), token
        else:
            # like kernel32.CreateFileA
            (module, _, function) = name.rpartition(".")
            variants = capa.features.extractors.helpers.generate_symbols(module, function)
            for variant in variants:
                yield Import(variant), token
mike-hunhoff marked this conversation as resolved.
Show resolved Hide resolved


def extract_file_os(**kwargs) -> Iterator[Tuple[OS, int]]:
    """yield the OS feature, which is always OS_ANY at offset 0x0

    keyword arguments are accepted and ignored.
    """
    any_os = OS(OS_ANY)
    yield any_os, 0x0


def extract_file_arch(pe, **kwargs):
def extract_file_arch(pe: dnfile.dnPE, **kwargs) -> Iterator[Tuple[Arch, int]]:
# to distinguish in more detail, see https://stackoverflow.com/a/23614024/10548020
# .NET 4.5 added option: any CPU, 32-bit preferred
if pe.net.Flags.CLR_32BITREQUIRED and pe.PE_TYPE == pefile.OPTIONAL_HEADER_MAGIC_PE:
Expand All @@ -36,11 +52,9 @@ def extract_file_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]:


FILE_HANDLERS = (
# extract_file_export_names,
# extract_file_import_names,
# extract_file_section_names,
# extract_file_strings,
# extract_file_function_names,
extract_file_import_names,
# TODO extract_file_strings,
# TODO extract_file_function_names,
extract_file_format,
)

Expand Down
Empty file.
62 changes: 62 additions & 0 deletions capa/features/extractors/dotnet/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, List, Tuple

if TYPE_CHECKING:
from capa.features.common import Feature

import dnfile

import capa.features.extractors
import capa.features.extractors.dotnet.file
import capa.features.extractors.dotnet.insn
from capa.features.extractors.base_extractor import FeatureExtractor
from capa.features.extractors.dotnet.helpers import get_dotnet_methods


class DnfileFeatureExtractor(FeatureExtractor):
    """feature extractor for .NET modules loaded with dnfile

    every method yielded by `get_functions` is treated as one function that contains exactly one basic block
    (the method itself); instructions come from the method's `instructions` attribute.
    """

    def __init__(self, path: str):
        """load the module at `path` and pre-compute the global (OS, arch) features"""
        # import locally: this module only imports `capa.features.extractors` and its `dotnet` subpackage,
        # so `capa.features.extractors.dnfile_` would otherwise only resolve if some other module had
        # already imported it.
        import capa.features.extractors.dnfile_

        super().__init__()
        self.pe: dnfile.dnPE = dnfile.dnPE(path)

        # pre-compute these because we'll yield them at *every* scope.
        self.global_features: List[Tuple[Feature, int]] = []
        self.global_features.extend(capa.features.extractors.dnfile_.extract_file_os(pe=self.pe))
        self.global_features.extend(capa.features.extractors.dnfile_.extract_file_arch(pe=self.pe))

    def get_base_address(self):
        """return the base address, which is always 0x0 for this extractor"""
        return 0x0

    def extract_global_features(self):
        """yield the features pre-computed in `__init__`"""
        yield from self.global_features

    def extract_file_features(self):
        """yield file-scope features for the loaded module"""
        yield from capa.features.extractors.dotnet.file.extract_features(self.pe)

    def get_functions(self):
        """yield each method found by `get_dotnet_methods`, tagged with a shared `ctx` dict"""
        # data structure shared across functions yielded here.
        # useful for caching analysis relevant across a single workspace.
        ctx = {}
        ctx["pe"] = self.pe

        for f in get_dotnet_methods(self.pe):
            # the same dict instance is attached to every method
            setattr(f, "ctx", ctx)
            yield f

    def extract_function_features(self, f):
        """yield function-scope features; not implemented yet, so nothing is yielded"""
        # TODO
        yield from []

    def get_basic_blocks(self, f):
        """yield the basic blocks of `f`"""
        # each dotnet method is considered 1 basic block
        yield f

    def extract_basic_block_features(self, f, bb):
        """yield basic-block-scope features; always empty"""
        # we don't support basic block features
        yield from []

    def get_instructions(self, f, bb):
        """yield the instructions of method `f`; `bb` is not consulted"""
        yield from f.instructions

    def extract_insn_features(self, f, bb, insn):
        """yield instruction-scope features for `insn`"""
        yield from capa.features.extractors.dotnet.insn.extract_features(f, bb, insn)
45 changes: 45 additions & 0 deletions capa/features/extractors/dotnet/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from __future__ import annotations

from typing import TYPE_CHECKING, List, Tuple, Iterator
from itertools import chain

if TYPE_CHECKING:
import dnfile
from capa.features.common import Feature

import capa.features.extractors
import capa.features.extractors.helpers
from capa.features.file import Import
from capa.features.common import FORMAT_DOTNET, Format
from capa.features.extractors.dotnet.helpers import get_dotnet_managed_imports, get_dotnet_unmanaged_imports


def extract_file_import_names(pe: dnfile.dnPE) -> Iterator[Tuple[Import, int]]:
    """yield an Import feature, paired with its token, for each managed and unmanaged import"""
    managed = get_dotnet_managed_imports(pe)
    unmanaged = get_dotnet_unmanaged_imports(pe)

    for (token, name) in chain(managed, unmanaged):
        if "::" not in name:
            # like kernel32.CreateFileA
            module, _, function = name.rpartition(".")
            for variant in capa.features.extractors.helpers.generate_symbols(module, function):
                yield Import(variant), token
            continue

        # like System.IO.File::OpenRead
        yield Import(name), token


def extract_file_format(pe: dnfile.dnPE) -> Iterator[Tuple[Format, int]]:
    """yield the file format feature by delegating to the dnfile_ extractor

    `pe` is accepted so the signature matches the other file handlers; it is not used here.
    """
    # import locally: this module imports `capa.features.extractors` and `capa.features.extractors.helpers`
    # but not the `dnfile_` submodule, so the attribute lookup below would otherwise only succeed if some
    # other module had already imported it.
    import capa.features.extractors.dnfile_

    yield from capa.features.extractors.dnfile_.extract_file_format()


def extract_features(pe: dnfile.dnPE) -> Iterator[Tuple[Feature, int]]:
    """run each handler in FILE_HANDLERS against `pe`, in order, and yield every (feature, token) pair"""
    for handler in FILE_HANDLERS:
        yield from ((feature, token) for (feature, token) in handler(pe))


# file-scope feature handlers, invoked in this order by `extract_features`.
# each handler is called with the dnfile.dnPE instance and yields (feature, token) pairs.
FILE_HANDLERS = (
    extract_file_import_names,
    # TODO extract_file_strings,
    # TODO extract_file_function_names,
    extract_file_format,
)
160 changes: 160 additions & 0 deletions capa/features/extractors/dotnet/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
from __future__ import annotations

import logging
from typing import Any, Tuple, Iterator, Optional

import dnfile
from dncil.cil.body import CilMethodBody
from dncil.cil.error import MethodBodyFormatError
from dncil.clr.token import Token, StringToken, InvalidToken
from dncil.cil.body.reader import CilMethodBodyReaderBase

logger = logging.getLogger(__name__)

# key indexes to dotnet metadata tables
DOTNET_META_TABLES_BY_INDEX = {table.value: table.name for table in dnfile.enums.MetadataTables}


class DnfileMethodBodyReader(CilMethodBodyReaderBase):
    """method body reader that fetches bytes through a dnfile PE and tracks a file-offset cursor"""

    def __init__(self, pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow):
        # the cursor starts at the file offset that corresponds to the MethodDef row's RVA
        start: int = pe.get_offset_from_rva(row.Rva)
        self.pe: dnfile.dnPE = pe
        self.offset: int = start

    def read(self, n: int) -> bytes:
        """request `n` bytes at the cursor and advance the cursor by `n`"""
        # the PE is addressed by RVA, so translate the cursor back before fetching
        rva = self.pe.get_rva_from_offset(self.offset)
        chunk: bytes = self.pe.get_data(rva, n)
        self.offset += n
        return chunk

    def tell(self) -> int:
        """return the current cursor (a file offset)"""
        return self.offset

    def seek(self, offset: int) -> int:
        """move the cursor to `offset` and return the new cursor"""
        self.offset = offset
        return offset


def generate_dotnet_token(table: int, rid: int) -> int:
    """pack a metadata table number and a row id into a single token value

    only the low 8 bits of `table` are kept (shifted by Token.TABLE_SHIFT); `rid` is masked with Token.RID_MASK.
    """
    table_bits: int = (table & 0xFF) << Token.TABLE_SHIFT
    rid_bits: int = rid & Token.RID_MASK
    return table_bits | rid_bits


def resolve_dotnet_token(pe: dnfile.dnPE, token: Token) -> Any:
    """map generic token to string or table row

    returns one of:
      - str: the user string, when `token` is a StringToken that can be read
      - table row: the row at `token.rid` of the metadata table selected by `token.table`
      - InvalidToken: when the string, the table, or the row cannot be resolved
    """
    if isinstance(token, StringToken):
        user_string: Optional[str] = read_dotnet_user_string(pe, token)
        if user_string is None:
            return InvalidToken(token.value)
        return user_string

    table_name: str = DOTNET_META_TABLES_BY_INDEX.get(token.table, "")
    if not table_name:
        # table index is not valid
        return InvalidToken(token.value)

    table: Any = getattr(pe.net.mdtables, table_name, None)
    if table is None:
        # table index is valid but table is not present
        return InvalidToken(token.value)

    if token.rid < 1:
        # row ids are 1-based; without this guard `rows[rid - 1]` would use a negative index and
        # silently resolve to the *last* row of the table instead of reporting an invalid token
        return InvalidToken(token.value)

    try:
        return table.rows[token.rid - 1]
    except IndexError:
        # table index is valid but row index is not valid
        return InvalidToken(token.value)


def read_dotnet_method_body(pe: dnfile.dnPE, row: dnfile.mdtable.MethodDefRow) -> Optional[CilMethodBody]:
    """read dotnet method body

    returns the parsed CilMethodBody, or None (after logging a warning) when parsing raises
    MethodBodyFormatError.
    """
    try:
        return CilMethodBody(DnfileMethodBodyReader(pe, row))
    except MethodBodyFormatError as e:
        # Logger.warn is a deprecated alias of Logger.warning; pass the arguments separately so the
        # message is only formatted when the record is actually emitted
        logger.warning("bad MethodDef row @ 0x%08x (%s)", row.Rva, e)
        return None


def read_dotnet_user_string(pe: dnfile.dnPE, token: StringToken) -> Optional[str]:
    """read user string from #US stream

    returns None when the #US stream is not available, when the entry at `token.rid` cannot be decoded,
    or when the lookup yields no string.
    """
    user_strings = pe.net.user_strings
    if user_strings is None:
        # NOTE(review): presumably dnfile leaves this as None when the module has no #US stream
        # (e.g. obfuscated samples) - confirm against the dnfile version in use
        return None

    try:
        user_string: Optional[dnfile.stream.UserString] = user_strings.get_us(token.rid)
    except UnicodeDecodeError as e:
        # keep the best-effort behavior, but leave a trace instead of swallowing the error silently
        logger.debug("failed to decode #US stream index 0x%08x (%s)", token.rid, e)
        return None

    if user_string is None:
        return None
    return user_string.value


def get_dotnet_managed_imports(pe: dnfile.dnPE) -> Iterator[Tuple[int, str]]:
    """yield (token, name) for each MemberRef row whose parent class is a TypeRef row

    the token is the MemberRef table token of the row; the name looks like System.IO.File::OpenRead.

    see https://www.ntcore.com/files/dotnetformat.htm

        10 - MemberRef Table
            Each row represents an imported method
                Class (index into the TypeRef, ModuleRef, MethodDef, TypeSpec or TypeDef tables)
                Name (index into String heap)
        01 - TypeRef Table
            Each row represents an imported class, its namespace and the assembly which contains it
                TypeName (index into String heap)
                TypeNamespace (index into String heap)
    """
    if not hasattr(pe.net.mdtables, "MemberRef"):
        return

    member_ref_table: int = dnfile.enums.MetadataTables.MemberRef.value

    # row ids are 1-based, so start counting at 1
    for (rid, member) in enumerate(pe.net.mdtables.MemberRef, start=1):
        parent = member.Class.row
        if not isinstance(parent, dnfile.mdtable.TypeRefRow):
            continue

        token: int = generate_dotnet_token(member_ref_table, rid)
        # like System.IO.File::OpenRead
        yield token, f"{parent.TypeNamespace}.{parent.TypeName}::{member.Name}"


def get_dotnet_unmanaged_imports(pe: dnfile.dnPE) -> Iterator[Tuple[int, str]]:
    """yield (token, name) for each row of the ImplMap table

    the name looks like kernel32.CreateFileA: the ImportScope name up to its first "." followed by the
    ImportName.

    see https://www.ntcore.com/files/dotnetformat.htm

        28 - ImplMap Table
            ImplMap table holds information about unmanaged methods that can be reached from managed code,
            using PInvoke dispatch
                MemberForwarded (index into the Field or MethodDef table; more precisely, a MemberForwarded
                coded index)
                ImportName (index into the String heap)
                ImportScope (index into the ModuleRef table)
    """
    if not hasattr(pe.net.mdtables, "ImplMap"):
        return

    for entry in pe.net.mdtables.ImplMap:
        module: str = entry.ImportScope.row.Name
        function: str = entry.ImportName

        # ECMA says "Each row of the ImplMap table associates a row in the MethodDef table (MemberForwarded)
        # with the name of a routine (ImportName) in some unmanaged DLL (ImportScope)"; so the token is built
        # from the MemberForwarded table and row index, to later match native import calls made from CIL
        forwarded = entry.MemberForwarded
        token: int = generate_dotnet_token(forwarded.table.number, forwarded.row_index)

        if module:
            # like Kernel32.dll -> Kernel32
            module = module.partition(".")[0]

        # like kernel32.CreateFileA
        yield token, f"{module}.{function}"


def get_dotnet_methods(pe: dnfile.dnPE) -> Iterator[CilMethodBody]:
    """yield the parsed body of each MethodDef row that is IL, not abstract, and not PInvoke

    rows whose body fails to parse (read_dotnet_method_body returns None) are skipped.
    """
    if not hasattr(pe.net.mdtables, "MethodDef"):
        return

    for method in pe.net.mdtables.MethodDef:
        # skip methods that do not have a method body
        if not method.ImplFlags.miIL:
            continue
        flags = method.Flags
        if flags.mdAbstract or flags.mdPinvokeImpl:
            continue

        parsed: Optional[CilMethodBody] = read_dotnet_method_body(pe, method)
        if parsed is not None:
            yield parsed
Loading