-
Notifications
You must be signed in to change notification settings - Fork 4
/
drop_nontrain.py
52 lines (42 loc) · 1.05 KB
/
drop_nontrain.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from pandas import DataFrame
from pyarrow import Table
import schema
nontraining_fields = [
"domain_name",
"dns_evaluated_on",
"rdap_evaluated_on",
"tls_evaluated_on",
# IP data
"ip_data",
"countries",
"latitudes",
"longitudes",
# DNS
"dns_dnssec",
"dns_zone_dnskey_selfsign_ok",
"dns_email_extras",
"dns_ttls",
"dns_zone",
"dns_zone_SOA",
*[f"dns_{t}" for t in schema.dns_types_all],
"rdap_registration_date",
"rdap_last_changed_date",
"rdap_expiration_date",
"rdap_dnssec",
"rdap_entities"
#"tls_root_cert_validity_remaining",
#"tls_leaf_cert_validity_remaining"
]
def drop_nontrain_table(table: Table):
"""
Drop non-training columns.
"""
fields = [x for x in nontraining_fields if x in table.column_names]
return table.drop(fields)
def drop_nontrain_df(df: DataFrame) -> DataFrame:
"""
Drop non-training columns.
"""
fields = [x for x in nontraining_fields if x in df.columns]
df.drop(columns=fields, inplace=True)
return df