Skip to content

Commit

Permalink
adding instruction feature extractors
Browse files Browse the repository at this point in the history
  • Loading branch information
mike-hunhoff committed Apr 5, 2022
1 parent 3f16877 commit 574a30d
Show file tree
Hide file tree
Showing 4 changed files with 173 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .github/mypy/mypy.ini
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,6 @@ ignore_missing_imports = True

[mypy-elftools.*]
ignore_missing_imports = True

[mypy-dncil.*]
ignore_missing_imports = True
Empty file.
71 changes: 71 additions & 0 deletions capa/features/extractors/dotnet/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from dnfile.mdtable import MemberRefRow
from dnfile.mdtable import MethodDefRow
from dnfile import dnPE

import dnfile
from dnfile.enums import MetadataTables
from dncil.cil.body import CilMethodBody
from dncil.clr.token import Token, InvalidToken
from dncil.cil.body.reader import CilMethodBodyReaderBase

# Map each MetadataTables member's value (the metadata table index) to that
# member's name, so a table index taken from a token can be turned into the
# attribute name that is looked up on `pe.net.mdtables`.
DOTNET_META_TABLES_BY_INDEX = {table.value: table.name for table in MetadataTables}


class DnfileMethodBodyReader(CilMethodBodyReaderBase):
    """Reader that serves the bytes of one method body out of a dnfile PE.

    NOTE: despite its name, `self.rva` stores a *file offset*: it is initialised
    from `pe.get_offset_from_rva(row.Rva)` and converted back to an RVA each time
    data is read. The attribute name is kept for backward compatibility.
    """

    def __init__(self, pe: dnfile.dnPE, row: MethodDefRow):
        """Position the reader at the method body referenced by `row.Rva`.

        Args:
            pe: parsed .NET PE that the bytes are read from.
            row: MethodDef row whose `Rva` locates the method body.
        """
        self.pe = pe
        # file offset of the method body (see class docstring about the name)
        self.rva = self.pe.get_offset_from_rva(row.Rva)

    def read(self, n):
        """Return `n` bytes from the current position and advance it by `n`."""
        data = self.pe.get_data(self.pe.get_rva_from_offset(self.rva), n)
        self.rva += n
        return data

    def tell(self):
        """Return the current position."""
        return self.rva

    def seek(self, rva):
        """Set the current position to `rva` (same units as `tell()` returns)."""
        self.rva = rva

    def get_token(self, value, is_str=False):
        """Resolve a raw metadata token value.

        Args:
            value: raw token value, decoded with `Token(value)`.
            is_str: when true, treat the token's rid as an index into the
                user-strings stream instead of a metadata table row.

        Returns:
            The user string value (when `is_str`), the referenced metadata table
            row, or an `InvalidToken` wrapping the raw value when the token
            cannot be resolved.
        """
        token = Token(value)

        if is_str:
            user_string = self.pe.net.user_strings.get_us(token.rid)
            if user_string is None:
                # presumably dnfile returns None for an index it cannot resolve
                # (TODO confirm); avoid an AttributeError on `.value`
                return InvalidToken(token.value)
            return user_string.value

        table_name = DOTNET_META_TABLES_BY_INDEX.get(token.table, "")
        if not table_name:
            # table index is not valid
            return InvalidToken(token.value)

        table = getattr(self.pe.net.mdtables, table_name, None)
        if table is None:
            # table index is valid but table is not present
            return InvalidToken(token.value)

        if token.rid == 0:
            # rows are addressed 1-based; rid 0 would otherwise become index -1
            # and silently resolve to the *last* row of the table
            return InvalidToken(token.value)

        try:
            return table.rows[token.rid - 1]
        except IndexError:
            # table index is valid but row index is not valid
            return InvalidToken(token.value)


def read_dotnet_method_body(pe: dnPE, row: MethodDefRow) -> CilMethodBody:
    """Build a `CilMethodBody` for the method described by `row` in `pe`.

    The body is read through a `DnfileMethodBodyReader` created for `pe` and `row`.
    """
    reader = DnfileMethodBodyReader(pe, row)
    return CilMethodBody(reader)


def get_imported_class_name(row: MemberRefRow) -> str:
    """Return the name of the class that `row` belongs to.

    The name is read from `row.Class.row` and formatted as
    `<TypeNamespace>.<TypeName>`. When the namespace is empty only `<TypeName>`
    is returned, so the result never starts with a stray ".".
    """
    type_row = row.Class.row
    if not type_row.TypeNamespace:
        return f"{type_row.TypeName}"
    return f"{type_row.TypeNamespace}.{type_row.TypeName}"
99 changes: 99 additions & 0 deletions capa/features/extractors/dotnet/insn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from __future__ import annotations

from typing import TYPE_CHECKING, List, Tuple, Union, Callable, Generator

if TYPE_CHECKING:
from dncil.cil.instruction import Instruction
from dncil.cil.body import CilMethodBody

import dncil
import dnfile
from dncil.cil.error import MethodBodyFormatError
from dncil.cil.opcode import OpCodes

import capa.features.extractors.helpers
import capa.features.extractors.dotnet.helpers
from capa.features.insn import API, Number
from capa.features.common import String


def extract_insn_api_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[API, int], None, None]:
    """Yield an API feature for a call-like instruction that targets an imported method.

    A feature is produced only when the opcode is Call, Callvirt, Jmp or Calli,
    the operand is a MemberRef row, and that row's class is a TypeRef row. The
    feature is named `<class name>::<method name>` and paired with the
    instruction offset. `f` is accepted for handler-signature parity and is not used.

    Background, see https://www.ntcore.com/files/dotnetformat.htm
        10 - MemberRef Table
            Each row represents an imported method.
                Class (index into the TypeRef, ModuleRef, MethodDef, TypeSpec or TypeDef tables)
        01 - TypeRef Table
            Each row represents an imported class, its namespace and the assembly which contains it.
                TypeName (index into String heap)
                TypeNamespace (index into String heap)
    """
    if insn.opcode not in (OpCodes.Call, OpCodes.Callvirt, OpCodes.Jmp, OpCodes.Calli):
        return

    if not isinstance(insn.operand, dnfile.mdtable.MemberRefRow):
        return

    if not isinstance(insn.operand.Class.row, dnfile.mdtable.TypeRefRow):
        return

    owner = capa.features.extractors.dotnet.helpers.get_imported_class_name(insn.operand)
    member = insn.operand.Name
    yield API(f"{owner}::{member}"), insn.offset


def extract_insn_number_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[Number, int], None, None]:
    """Yield a Number feature (with the instruction offset) when `insn.is_ldc()` is true.

    The number is whatever `insn.get_ldc()` returns. `f` is not used.
    """
    if not insn.is_ldc():
        return

    constant = insn.get_ldc()
    yield Number(constant), insn.offset


def extract_insn_string_features(f: CilMethodBody, insn: Instruction) -> Generator[Tuple[String, int], None, None]:
    """Yield a String feature (with the instruction offset) when `insn.is_ldstr()` is true.

    The string is taken from `insn.operand`. `f` is not used.
    """
    if not insn.is_ldstr():
        return

    yield String(insn.operand), insn.offset


def extract_features(
    f: CilMethodBody, insn: Instruction
) -> Generator[Tuple[Union[API, String, Number], int], None, None]:
    """Run every handler in INSTRUCTION_HANDLERS on `insn` and yield each (feature, offset) pair."""
    yield from (
        (feature, offset)
        for handler in INSTRUCTION_HANDLERS
        for feature, offset in handler(f, insn)
    )


# Handlers that extract_features() calls, in this order, for each instruction.
# Each one is called as handler(f, insn) and yields (feature, offset) pairs.
INSTRUCTION_HANDLERS = (
    extract_insn_api_features,
    extract_insn_number_features,
    extract_insn_string_features,
)


def main(args):
    """Pretty-print the instruction features extracted from a .NET PE.

    Args:
        args: object with a `path` attribute naming the .NET PE to load
            (the argparse namespace when run as a script).

    Every MethodDef row flagged `ImplFlags.miIL` has its body read and each
    instruction passed through extract_features(). A method whose body raises
    MethodBodyFormatError is reported with print() and skipped.
    """
    import pprint

    dn = dnfile.dnPE(args.path)

    features = []
    for row in dn.net.mdtables.MethodDef:
        if not row.ImplFlags.miIL:
            continue

        try:
            # use the module imported at the top of this file rather than a bare
            # name that only exists when this file is executed as a script
            body = capa.features.extractors.dotnet.helpers.read_dotnet_method_body(dn, row)
        except MethodBodyFormatError as e:
            print(e)
            continue

        for insn in body.instructions:
            features.extend(extract_features(body, insn))

    pprint.pprint(features)


if __name__ == "__main__":
    # script entry point: parse the command line and dump the extracted features
    import argparse

    # bring read_dotnet_method_body into module scope when run as a script
    from capa.features.extractors.dotnet.helpers import read_dotnet_method_body

    cli = argparse.ArgumentParser(prog="parse instruction features from .NET PE")
    cli.add_argument("path", type=str, help="full path to .NET PE")
    cli_args = cli.parse_args()

    main(cli_args)

0 comments on commit 574a30d

Please sign in to comment.