Source code for random_mac.dataset

"""
This module contains dataset-related functions.
"""


import csv
import itertools
import os
import macaddress
import numpy
import pandas


def get_ieee_assignments(file):
    """
    Retrieve OUIs and CIDs.

    Parameters
    ----------
    file : str
        The name of a CSV file with information on OUIs and CIDs assigned
        by the IEEE. Typical names are `oui.csv` and `cid.csv`. The file
        must have a header row containing an `Assignment` column.

    Returns
    -------
    list
        The values of the `Assignment` column, one per record and in file
        order. An empty list is returned when the file has no records.

    Raises
    ------
    KeyError
        If a record has no `Assignment` column.
    """
    # `newline=""` lets the csv module do its own line-ending handling, so
    # quoted fields with embedded line breaks are parsed correctly.  The
    # explicit encoding keeps non-ASCII organization names from breaking
    # decoding on platforms whose default encoding is not UTF-8.
    with open(file, encoding="utf-8", newline="") as source:
        return [record["Assignment"] for record in csv.DictReader(source)]
def make_hexadecimal_digit_strings(assignments):
    """
    Extend each OUI/CID with random hexadecimal digits.

    Parameters
    ----------
    assignments : list
        A list of 24-bit OUIs or CIDs assigned by the IEEE, given as
        hexadecimal strings.

    Returns
    -------
    list
        One string per assignment, in the same order: the assignment
        followed by six hexadecimal digits (24 random bits), giving a
        48-bit hexadecimal string for a 24-bit assignment.
    """
    # Three bytes from the OS random source render as six hex digits.
    return [prefix + os.urandom(3).hex() for prefix in assignments]
def make_random_hexadecimal_digit_strings(number):
    """
    Generate fully random hexadecimal strings.

    Parameters
    ----------
    number : int
        How many hexadecimal strings to generate.

    Returns
    -------
    list
        `number` strings of twelve hexadecimal digits each, i.e. 48 random
        bits per string.
    """
    random_strings = []
    for _ in range(number):
        # Six bytes from the OS random source render as twelve hex digits.
        random_strings.append(os.urandom(6).hex())
    return random_strings
def get_mac_features(digit_string):
    """
    Extract the features of a single MAC address.

    Parameters
    ----------
    digit_string : str
        A 48-bit hexadecimal string used to instantiate
        `macaddress.MediaAccessControlAddress`.

    Returns
    -------
    tuple
        An eight-tuple holding, in this order, the `type`, `has_oui`,
        `has_cid`, `is_broadcast`, `is_multicast`, `is_unicast`, `is_uaa`,
        and `is_laa` attributes of the MAC address.
    """
    feature_names = (
        "type",
        "has_oui",
        "has_cid",
        "is_broadcast",
        "is_multicast",
        "is_unicast",
        "is_uaa",
        "is_laa",
    )
    address = macaddress.MediaAccessControlAddress(digit_string)
    # Read the attributes in the documented order so that tuple positions
    # stay stable for downstream consumers.
    return tuple(getattr(address, name) for name in feature_names)
def get_features(digit_strings):
    """
    Extract the features of several MAC addresses.

    Parameters
    ----------
    digit_strings : list
        A list of 48-bit hexadecimal strings.

    Returns
    -------
    list
        One feature tuple per input string, in input order, as produced by
        `get_mac_features`.
    """
    return [get_mac_features(item) for item in digit_strings]
def normalize_features(features):
    """
    Convert MAC address features into a numeric numpy array.

    Parameters
    ----------
    features : list
        A list of tuples, where each tuple contains the features of a MAC
        address.

    Returns
    -------
    numpy array
        An array with one row per feature tuple, in which the strings
        `"unique"`, `"local"`, and `"unknown"` are replaced by `2`, `1`,
        and `0`, and the booleans `True` and `False` are replaced by `1`
        and `0`.
    """
    numeric_codes = {
        "unique": 2,
        "local": 1,
        "unknown": 0,
        True: 1,
        False: 0,
    }
    frame = pandas.DataFrame(features)
    encoded = frame.replace(numeric_codes)
    return encoded.to_numpy()
def make_labels(value, number):
    """
    Build a list of identical labels for a binary classifier.

    Parameters
    ----------
    value : int
        The label, where `0` means a non-random MAC address and `1` means
        a random MAC address.
    number : int
        How many labels to produce.

    Returns
    -------
    list
        A list holding `value` repeated `number` times; empty when
        `number` is zero or negative.
    """
    return [value for _ in range(number)]
def normalize_labels(labels):
    """
    Convert labels into a numpy array.

    Parameters
    ----------
    labels : list
        A list of labels.

    Returns
    -------
    numpy array
        A new numpy array holding the given labels in the same order.
    """
    label_array = numpy.array(labels)
    return label_array
def make(multiple, oui_file="./oui.csv", cid_file="./cid.csv"):
    """
    Build a dataset for training and testing purposes.

    Parameters
    ----------
    multiple : int
        The number of random MAC addresses to create for every non-random
        MAC address.
    oui_file : str
        The name of the file with OUIs assigned by the IEEE.
    cid_file : str
        The name of the file with CIDs assigned by the IEEE.

    Returns
    -------
    tuple
        A two-tuple `(features, labels)` of normalized features and
        normalized labels. Non-random entries (label `0`) come first,
        followed by the random entries (label `1`).
    """
    # Collect every IEEE assignment, OUIs first and CIDs second.
    assignments = (
        get_ieee_assignments(oui_file) + get_ieee_assignments(cid_file)
    )

    # One non-random string per assignment, then `multiple` times as many
    # fully random strings (truncated to a whole number).
    assigned_strings = make_hexadecimal_digit_strings(assignments)
    random_count = int(multiple * len(assigned_strings))
    random_strings = make_random_hexadecimal_digit_strings(random_count)

    # Extract features for both groups.
    assigned_features = get_features(assigned_strings)
    random_features = get_features(random_strings)

    # Label non-random addresses 0 and random addresses 1, keeping the same
    # order as the concatenated features below.
    all_labels = (
        make_labels(0, len(assigned_features))
        + make_labels(1, len(random_features))
    )

    return (
        normalize_features(assigned_features + random_features),
        normalize_labels(all_labels),
    )