masscan --readscan
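An Elasticsearch index template for masscan-* indices, plus a Python script that parses masscan's binary scan output (the format read by masscan --readscan) and bulk-indexes each record with parallel_bulk. Both files follow below.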
{
  "masscan": {
    "order": 0,
    "template": "masscan-*",
    "settings": {
      "index": {
        "number_of_shards": "3",
        "number_of_replicas": "1"
      }
    },
    "mappings": {
      "ReasonStatus": {
        "properties": {
          "reason": {
            "index": "not_analyzed",
            "type": "string"
          }
        }
      },
      "BannerStatus": {
        "properties": {
          "banner": {
            "type": "string",
            "fields": {
              "raw": {
                "index": "not_analyzed",
                "type": "string"
              }
            }
          },
          "app_proto": {
            "index": "not_analyzed",
            "type": "string"
          }
        }
      },
      "_default_": {
        "_all": {
          "enabled": false
        },
        "properties": {
          "ip_proto": {
            "index": "not_analyzed",
            "type": "string"
          },
          "reason": {
            "index": "not_analyzed",
            "type": "string"
          },
          "ip": {
            "type": "ip"
          },
          "origin": {
            "index": "no",
            "type": "string"
          },
          "ttl": {
            "type": "integer"
          },
          "status": {
            "index": "not_analyzed",
            "type": "string"
          },
          "timestamp": {
            "format": "epoch_second",
            "type": "date"
          }
        }
      }
    },
    "aliases": {}
  }
}
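The template above is stored in the shape returned by GET _template/masscan (wrapped in a top-level "masscan" key) and matches indices named masscan-*; the indexer below writes to masscan-1. The string/not_analyzed mappings are from the Elasticsearch 2.x era, so a newer cluster would need an updated mapping. A minimal sketch of installing it with the same Python client the indexer uses; the file name and host are assumptions:

# Sketch: install the index template before running the indexer.
# Assumes the JSON above is saved as masscan-template.json (hypothetical name)
# and an Elasticsearch node is reachable on localhost:9200.
import json
import elasticsearch

es = elasticsearch.Elasticsearch(['localhost:9200'])
with open('masscan-template.json') as fh:
    wrapped = json.load(fh)
# Unwrap the top-level "masscan" key so only the template body is sent.
es.indices.put_template(name='masscan', body=wrapped['masscan'])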
import logging
import mmap
import struct
from datetime import datetime
from enum import IntEnum
from functools import partial
from glob import glob
from ipaddress import IPv4Address
from argparse import ArgumentParser

import elasticsearch
from elasticsearch.helpers import parallel_bulk

logging.basicConfig(level=logging.INFO)


# from elasticsearch.serializer import JSONSerializer  # TODO: implement as serializer


class Timestamp(int):
    def __repr__(self):
        return datetime.fromtimestamp(self).isoformat()


class CustomIntEnum(IntEnum):
    def __str__(self):
        return self.name


class RecordStatus(CustomIntEnum):
    Open = 1
    Closed = 2
    Banner1 = 5
    Open2 = 6
    Closed2 = 7
    Arp2 = 8
    Banner9 = 9


class InternetProtocol(CustomIntEnum):
    ARP = 0
    ICMP = 1
    TCP = 6
    UDP = 17
    SCTP = 132


class ApplicationProtocol(CustomIntEnum):
    PROTO_NONE = 0
    PROTO_HEUR = 1
    PROTO_SSH1 = 2
    PROTO_SSH2 = 3
    PROTO_HTTP = 4
    PROTO_FTP = 5
    PROTO_DNS_VERSIONBIND = 6
    PROTO_SNMP = 7
    PROTO_NBTSTAT = 8
    PROTO_SSL3 = 9
    PROTO_SMTP = 10
    PROTO_POP3 = 11
    PROTO_IMAP4 = 12
    PROTO_UDP_ZEROACCESS = 13
    PROTO_X509_CERT = 14
    PROTO_HTML_TITLE = 15
    PROTO_HTML_FULL = 16
    PROTO_NTP = 17
    PROTO_VULN = 18
    PROTO_HEARTBLEED = 19
    PROTO_VNC_RFB = 20
    PROTO_SAFE = 21


class Bitset(list):
    # Decodes an integer bitmask into a list of labels for the set bits,
    # e.g. list(ReasonFlags(0b00010010)) == ['SYN', 'ACK'].
    labels = []

    def __init__(self, value):
        assert isinstance(value, int)
        # TODO: support list of labels -> int
        self.value = value
        super().__init__(list(self))

    def __int__(self):
        return self.value

    def __iter__(self):
        n = 1
        i = 0
        while n <= self.value:
            if self.value & n:
                yield self.labels[i] if len(self.labels) > i and self.labels[i] else hex(n)
            i += 1
            n = 1 << i


class ReasonFlags(Bitset):
    labels = ['FIN', 'SYN', 'RST', 'PSH', 'ACK', 'URG', 'ECE', 'CWR']


class MassscanStatus(object):
    def __init__(self, status, timestamp, ip, ip_proto, port, ttl):
        self.status = status  # TODO: make more consistent with other args
        self.timestamp = Timestamp(timestamp)
        self.ip = IPv4Address(ip)
        self.ip_proto = InternetProtocol(ip_proto)
        self.port = port
        self.ttl = ttl

    def __repr__(self):
        return '%s: %s' % (self.__class__.__name__, self.__dict__)


class ReasonStatus(MassscanStatus):
    def __init__(self, status, timestamp, ip, ip_proto, port, reason, ttl):
        super(ReasonStatus, self).__init__(status, timestamp, ip, ip_proto, port, ttl)
        self.reason = ReasonFlags(reason)


class BannerStatus(MassscanStatus):
    def __init__(self, status, timestamp, ip, ip_proto, port, app_proto, ttl, banner):
        super(BannerStatus, self).__init__(status, timestamp, ip, ip_proto, port, ttl)
        self.app_proto = ApplicationProtocol(app_proto)
        self.banner = banner


class MasscanReader:
    # The binary files begin with a ~99-byte header and end with a ~99-byte
    # footer, both starting with the version string checked here.
    compat_version = b'masscan/1.1'

    def __init__(self, path):
        self.path = path
        self.fh = None
        self.buffer = None
        self.start_time = None
        try:
            fh = open(path, 'rb')
            buffer = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception as e:
            logging.critical(e)
        else:
            self.fh = fh
            self.buffer = buffer
            # Read the header of the file and assess compatibility
            headers = buffer[0:100].rstrip(b"\x00").split()
            assert headers[0].startswith(self.compat_version)
            self.version = headers[0]
            for header in headers[1:]:
                header = header.decode('ascii')
                if ':' in header:
                    (header_type, value) = tuple(header.split(':', 1))
                    if header_type == 's':
                        # Not sure if this is intended to be a generic string or an indication of the start time
                        self.start_time = datetime.fromtimestamp(int(value))
                    else:
                        logging.warning("%s: has unknown header: %s", self, header)
            # TODO: parse other headers?
            footers = buffer[buffer.size() - 99:].rstrip(b"\x00").split(b"\n")
            if footers[0] == self.compat_version:
                self.footers = footers
            else:
                logging.warning("%s: has an invalid footer, incomplete file?", self)

    def __del__(self):
        if self.buffer:
            self.buffer.close()
        if self.fh:
            self.fh.close()

    def __repr__(self):
        return "%s: %s, timestamp: %s" % (self.__class__.__name__, self.path, self.start_time)

    def __iter__(self):
        # Each record: 1 status byte, a 1- or 2-byte length, then the payload.
        logging.debug("Parsing results from: %s", self.path)
        offset = 99
        try:
            while offset < self.buffer.size() - 99:
                status = RecordStatus(self.buffer[offset])
                length_width = 2
                length = self.buffer[offset + 1]
                if status in [RecordStatus.Open2, RecordStatus.Closed2, RecordStatus.Arp2]:
                    assert length == 13
                    (timestamp, ip, ip_proto, port, reason, ttl) = struct.unpack_from('>LLBHBB', self.buffer,
                                                                                      offset + 2)
                    status = ReasonStatus(status, timestamp, ip, ip_proto, port, reason, ttl)
                elif status == RecordStatus.Banner9:
                    if length >= 128:
                        length_width = 3
                        if self.buffer[offset + 2] > 0b01111111:
                            logging.warning("it's happening")  # TODO: test this
                        length = ((self.buffer[offset + 1] & 0b01111111) << 7) | \
                                 (self.buffer[offset + 2] & 0b01111111)
                    (timestamp, ip, ip_proto, port, app_proto, ttl) = struct.unpack_from('>LLBHHB',
                                                                                         self.buffer,
                                                                                         offset + length_width)
                    banner = self.buffer[offset + length_width + 14:offset + length + length_width].decode('latin-1')
                    status = BannerStatus(status, timestamp, ip, ip_proto, port, app_proto, ttl, banner)
                else:
                    break
                offset += length + length_width
                yield status
        except Exception as e:
            logging.warning('%s: %s @ %d', self, e, offset)

    @classmethod
    def iter_results_glob(cls, pattern):
        scan_files = glob(pattern)
        for n, scan_file in enumerate(sorted(scan_files, reverse=True)):
            logging.debug("%d of %d: %s", n, len(scan_files), scan_file)
            yield cls(scan_file)


class MasscanIndexer:
    INDEX_VERSION = 1

    def __init__(self, dbs):
        self.es = elasticsearch.Elasticsearch(dbs, timeout=10)

    def index_glob(self, pattern):
        for scan in MasscanReader.iter_results_glob(pattern):
            self._index(scan)

    def index(self, path):
        scan = MasscanReader(path)
        self._index(scan)

    def _index(self, scan):
        logging.debug("Indexing: %s", scan)
        try:
            def StatusToDict(status, **kwargs):
                assert isinstance(status, MassscanStatus)
                # d = dict([(k, str(v)) if isinstance(v, int) else (k, v) for k, v in status.__dict__.items()])
                d = dict(status.__dict__)
                for k, v in d.items():
                    if isinstance(v, CustomIntEnum):
                        d[k] = str(v)
                    elif isinstance(v, IPv4Address):
                        d[k] = v.compressed
                    elif isinstance(v, Bitset):
                        d[k] = list(v)
                d.update(kwargs)
                d['_type'] = status.__class__.__name__
                return d

            for success, info in parallel_bulk(self.es,
                                               map(partial(StatusToDict,
                                                           _index='masscan-%d' % self.INDEX_VERSION,
                                                           origin=scan.path),
                                                   scan)):
                if success:
                    logging.debug(info)
                else:
                    logging.warning(info)
        # except elasticsearch.helpers.BulkIndexError as e:
        except Exception as e:
            logging.warning(e)


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('glob', help='Glob pattern to index')
    parser.add_argument('hosts', type=str, metavar='host', nargs='+', help='Elasticsearch hosts')
    args = parser.parse_args()
    MasscanIndexer(args.hosts).index_glob(args.glob)
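A quick usage sketch, assuming the script above is saved as masscan_indexer.py and the scan files were produced in masscan's binary output format (the one --readscan reads back); the module name, paths, and host below are placeholders:

# Hypothetical module name, paths, and host; adjust to your layout.
from masscan_indexer import MasscanIndexer, MasscanReader

# Inspect parsed records without touching Elasticsearch:
for record in MasscanReader('/var/log/masscan/scan.mass'):
    print(record)

# Bulk-index every matching file (what the CLI entry point does):
MasscanIndexer(['localhost:9200']).index_glob('/var/log/masscan/*.mass')

From the shell, the equivalent invocation would be: python3 masscan_indexer.py '/var/log/masscan/*.mass' localhost:9200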