Logstash configuration example for munging logs generated by Infinite Campus
###########################################################################################################
# I have this file saved as iclogs.yml, but you could name the file anything you want. When you start
# up logstash you need to tell it where this file is using the -f flag on the command line, like:
#
#   logstash -f iclogs.yml
#
# There are still some bugs in the parsing that result in keys with no name, which it may be possible
# to clean up with additional gsub rules in the mutate filter.
###########################################################################################################
# Define what logstash will be looking for in terms of input
input {
  # Since we are throwing existing files at it we use the file input plugin, which ships with
  # standard Logstash distributions; if yours is missing it, install it from the command line using:
  # logstash-plugin install logstash-input-file
  file {
    # This tells logstash where to find the files and the type of files to look for
    path => "/Location/Where/logFiles/areStored/**/*.log"
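    # (the ** glob descends into subdirectories, so any .log file nested anywhere under
    #  /Location/Where/logFiles/areStored/ gets picked up)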
    # And where to start the processing from
    start_position => "beginning"
  } # End of the file configuration block
  # If we were handling the log data in real time we could instead include an IP address and
  # port that logstash would "listen" to for the data to stream in, as sketched below
} # End of the input block
# This is one of the more important sections since it essentially defines how the data will be parsed
filter {
  # The mutate filter is another bundled plugin; if missing, install it using:
  # logstash-plugin install logstash-filter-mutate
  mutate {
    # Make sure it is reading all the data as strings
    convert => { "message" => "string" }
  } # End of the first mutate block
  # The grok plugin defines a bunch of preformatted parsers (basically makes life easier)
  # If missing, install it using: logstash-plugin install logstash-filter-grok
  grok {
    # Defines one or more patterns that an input should match. "message" is the default
    # reference to the incoming data, and the "%{COMBINEDAPACHELOG}" pattern handles
    # separating the majority of the data into distinct fields for us
    match => ["message", "%{COMBINEDAPACHELOG}"]
  } # End of the grok block
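  # As a rough illustration (this sample line is made up, not from the real logs),
  # %{COMBINEDAPACHELOG} would take something like:
  #   10.0.0.1 - - [24/Sep/2020:12:29:00 -0500] "GET /campus/index.jsp HTTP/1.1" 200 2326 "https://campus/portal/?x.module=grades" "Mozilla/5.0"
  # and split it into fields such as clientip, timestamp, verb, request, response,
  # bytes, referrer, and agent.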
  # Next mutate block
  mutate {
    # Now we're going to use some regular expressions to clean up the referrer string
    # (i.e., the page making the request to the server). The URL-encoded space (%20)
    # makes for not-so-friendly fields/values, so every time it appears in the referrer
    # string it gets replaced with an underscore; then the pattern "x." is removed (the
    # period has to be escaped, which is why it is \. below); and finally "/?" is
    # collapsed to "/"
    gsub => [ "referrer", "(%20)", "_", "referrer", "x\.", "", "referrer", "/\?", "/" ]
  } # End second mutate block
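  # For example (hypothetical referrer, for illustration only), these substitutions
  # would turn:
  #   "https://campus/portal/?x.student%20name=doe"
  # into:
  #   "https://campus/portal/student_name=doe"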
  # The kv filter is another bundled plugin (logstash-plugin install logstash-filter-kv
  # if it is missing). This is used to parse the query fields into key-value pairs
  kv {
    # Any time a & or ? occurs in the URL string it will be treated as the start of a
    # new key-value pair
    field_split => "&?"
    # But we are only going to do this to the referrer string
    source => "referrer"
    # And will only return unique key-value pairs from an entry
    allow_duplicate_values => false
    # And will trim spaces and commas from the keys (this option is spelled trimkey in
    # some older releases of the plugin)
    trim_key => " ,"
    # And lowercase the keys
    transform_key => "lowercase"
    # And lowercase the values as well
    transform_value => "lowercase"
  } # End of kv block
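  # For example (again a made-up referrer), the settings above would turn a referrer of
  #   "https://campus/portal/index?first=Anna&first=Anna&Grade=10"
  # into the key-value pairs first => "anna" and grade => "10" (the duplicate pair is
  # dropped and the keys and values are lowercased).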
} # End of filter block
# This is the last block, where we tell logstash where to put the data
output {
  # The elasticsearch output is also a bundled plugin (install with
  # logstash-plugin install logstash-output-elasticsearch if it is missing).
  # This handles a few different things and tells logstash that it is going to be
  # pushing the data into an instance of elasticsearch (extremely fast search
  # engine/NoSQL database)
  elasticsearch {
    # Location of the elasticsearch server
    hosts => ["127.0.0.1:9200"]
    # An index in elasticsearch is a place where documents get stored; it isn't the same
    # as an index on an SQL table, but it allows different documents to have distinct
    # schemas and/or multiple types related to a single index. Leaving the date out of
    # the name puts all of the data into a single index, so queries will be easier to
    # write, but with a single node (instance) of elasticsearch and no sharding
    # (splitting the files across multiple instances) the searching will probably be
    # noticeably slower
    index => "iclogs"
    # A template is also needed to avoid the error that would otherwise be triggered by
    # the number of fields this process generates. This should be the path to the
    # template file below in this Gist
    template => "/Location/ofThe/Template/File/iclogtemplate.yml"
    # As an alternative we could also create the indices based on the date of the log
    # event (taken from its @timestamp field):
    # index => "logstash-%{+YYYY.MM.dd}"
  } # End of elasticsearch block
} # End of output block
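# Once logstash has been running for a bit you can sanity-check that documents are
# actually landing in the index, e.g. (assuming the host/port configured above):
#   curl 'http://127.0.0.1:9200/iclogs/_count?pretty'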
iclogtemplate.yml
# Creates an index template for IC log files
# (Note: strict JSON does not allow comments, so if Logstash/Elasticsearch rejects this
# file, the # lines will need to be stripped out first.)
{
  # Defines the index (pattern) this template applies to
  "template": "iclogs",
  # Now provide some settings for the index
  "settings" : {
    # Number of shards to create from the data
    "number_of_shards" : 5,
    # Number of replicas to create (0, since this is a single-node setup)
    "number_of_replicas": 0,
    # Raise the maximum number of fields allowed in the index, since the kv parsing
    # generates a very large number of distinct keys
    "index.mapping.total_fields.limit": 10000
  } # End of settings block
} # End of iclogs index template
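If you would rather load the template into Elasticsearch by hand instead of letting the
elasticsearch output push it, something along these lines should work against the legacy
template API (after stripping the comments so the body is valid JSON; the file name is
just the one referenced in the config above):

  curl -XPUT 'http://127.0.0.1:9200/_template/iclogs' \
       -H 'Content-Type: application/json' \
       -d @iclogtemplate.yml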
Here is an example of part of a query string that causes the no-name values to appear:

Need to include a regular expression that tests for values on both sides of the = sign
when splitting into key-value pairs. Also need to learn the mapping used by the
application to consolidate the number of fields/object references into a more
parsimonious and meaningful set.
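One possible (untested) way to sketch that: drop malformed pairs from the referrer with
an extra gsub pass before the kv filter runs, e.g.

  mutate {
    # strip pairs with an empty key, like "&=value" or "?=value" (hypothetical rule,
    # not part of the original config)
    gsub => [ "referrer", "[&?]=[^&]*", "" ]
  }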