aggregation of crawled references using mongodb
{
  "referrerLinkText": null,
  "isRootParentReference": false,
  "sitemapLastMod": null,
  "parentRootReference": null,
  "referrerLinkTag": null,
  "sitemapChangeFreq": null,
  "crawlState": "REJECTED",
  "isValid": false,
  "contentType": "text/html",
  "stage": "PROCESSED",
  "sitemapPriority": null,
  "referrerReference": null,
  "referrerLinkTitle": null,
  "crawlDate": "2018-02-06T22:21:19.869000",
  "reference": "https://stemcells.nih.gov",
  "depth": 0,
  "contentChecksum": null,
  "originalReference": null,
  "_id": "5a7a289639ec2e4736d0854b",
  "metaChecksum": null
}
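The aggregation snippets below assume that refs is a pymongo collection holding documents shaped like the one above. A minimal sketch of how that handle might be obtained; the connection URI, database name, and collection name here are hypothetical:

# Sketch: obtain the "refs" collection used in the aggregation snippets below.
# The URI, database name, and collection name are hypothetical placeholders.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
refs = client['crawler']['references']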
/* Pipe the data back and build a chart like this */
var chart = c3.generate({
    size: {
        width: 960,
    },
    data: {
        x: 'x',
        columns: [
            ['x', '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08', '2018-02-09', '2018-02-10'],
            ['New', 300, 350, 300, 0, 0, 120],
            ['Redirect', 130, 100, 140, 200, 150, 50],
            ['Other', 12, 16, 20, 12, 10, 7],
        ],
        types: {
            New: 'area',
            Redirect: 'area',
            Other: 'area'
            // 'line', 'spline', 'step', 'area', 'area-step' are also available to stack
        },
        groups: [['New', 'Redirect', 'Other']],
        colors: {
            New: '#18993c',
            Redirect: '#f4b642',
            Other: '#f45342'
        }
    },
    axis: {
        x: {
            type: 'timeseries',
            tick: {
                format: '%Y-%m-%d'
            }
        },
        y: {
            label: 'Pages'
        }
    }
});
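The column arrays above were filled in by hand from aggregation output. A rough Python sketch of how per-day counts could be reshaped into that columns format; the series names and counts below are illustrative, taken from the faceted aggregation further down:

# Sketch: reshape per-day counts into the c3 'columns' arrays used above.
# day_counts maps a series name to {date string: count}; the values are illustrative.
day_counts = {
    'New':      {'2018-02-06': 837, '2018-02-07': 7098, '2018-02-08': 3699},
    'Redirect': {'2018-02-06': 69,  '2018-02-07': 1829, '2018-02-08': 1613},
    'Other':    {'2018-02-06': 33,  '2018-02-07': 261,  '2018-02-08': 99},
}
dates = sorted({d for counts in day_counts.values() for d in counts})
columns = [['x'] + dates]
for name, counts in day_counts.items():
    columns.append([name] + [counts.get(d, 0) for d in dates])
# columns can then be serialized to JSON and dropped into the chart definition above.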
# assume refs is a reference to a Mongo collection
cursor = refs.aggregate([
    # the "$match" stage selects references that have been processed.
    # they should all have a crawlState, but just in case, we make sure of it.
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True}
    }},
    # the "$group" stage counts up the references in each state
    {'$group': {
        '_id': {'outcome': '$crawlState'},
        'count': {'$sum': 1}
    }}
])
# get the results
list(cursor)
# In my case, I get the following:
[{'_id': {'outcome': 'REJECTED'}, 'count': 264},
 {'_id': {'outcome': 'BAD_STATUS'}, 'count': 89},
 {'_id': {'outcome': 'REDIRECT'}, 'count': 3511},
 {'_id': {'outcome': 'NOT_FOUND'}, 'count': 19},
 {'_id': {'outcome': 'ERROR'}, 'count': 50},
 {'_id': {'outcome': 'NEW'}, 'count': 11634}]
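If a flat mapping from crawlState to count is more convenient than the list of grouped documents, a small sketch using a dict comprehension (the pipeline is repeated here because the cursor above is already exhausted):

# Sketch: flatten the grouped documents into a {crawlState: count} mapping.
counts = {doc['_id']['outcome']: doc['count']
          for doc in refs.aggregate([
              {'$match': {'stage': 'PROCESSED', 'crawlState': {'$exists': True}}},
              {'$group': {'_id': {'outcome': '$crawlState'}, 'count': {'$sum': 1}}}
          ])}
# for the run above, counts['NEW'] == 11634 and counts['REDIRECT'] == 3511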
# again, we assume refs is the collection
from datetime import datetime, timedelta, time
now = datetime.now()
startofday = datetime.combine(now, time.min)
# these will be the buckets for our aggregation
boundaries = [datetime.min] + [startofday - timedelta(days=i) for i in range(5, 0, -1)] + [now]
# Mongo wants the array of buckets to be sorted in ascending order, which is why we used range(5, 0, -1).
# anyway, we get something like this:
[datetime.datetime(1, 1, 1, 0, 0),
 datetime.datetime(2018, 2, 4, 0, 0),
 datetime.datetime(2018, 2, 5, 0, 0),
 datetime.datetime(2018, 2, 6, 0, 0),
 datetime.datetime(2018, 2, 7, 0, 0),
 datetime.datetime(2018, 2, 8, 0, 0),
 datetime.datetime(2018, 2, 9, 15, 15, 3, 165743)]
# Now, we construct a bucketing stage based on those boundaries
bucket = {'$bucket': {
    'groupBy': '$crawlDate',
    'boundaries': boundaries,
    'output': {
        'count': {'$sum': 1}
    }
}}
# The two most popular crawlStates are 'NEW' and 'REDIRECT', so we lump the rest together:
cursor = refs.aggregate([
    # We again select references that have been processed, but this time we also check that the dates are recent enough
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True},
        'crawlDate': {'$gte': startofday - timedelta(days=5)}
    }},
    # now we facet them
    {'$facet': {
        'new': [
            {'$match': {'crawlState': 'NEW'}}, bucket
        ],
        'redirect': [
            {'$match': {'crawlState': 'REDIRECT'}}, bucket
        ],
        'other': [
            {'$match': {'crawlState': {'$nin': ['NEW', 'REDIRECT']}}}, bucket
        ]
    }}
])
# For this calculation, I get reasonable results, since I restarted the crawl from scratch recently
# and the shutdown was real enough that it is not running today:
list(cursor)
[{'new': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 837},
          {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 7098},
          {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 3699}],
  'other': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 33},
            {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 261},
            {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 99}],
  'redirect': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 69},
               {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 1829},
               {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 1613}]}]
# How can I do this without knowing the crawlStates in advance?
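One possible way to avoid hard-coding the crawlStates (a sketch I have not run against this collection): instead of faceting per state, group on the state and the calendar day together in a single $group stage, using $dateToString (available since MongoDB 3.0) to truncate crawlDate to a day:

# Sketch: one $group keyed by crawlState and day, so no list of states is needed.
cursor = refs.aggregate([
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True},
        'crawlDate': {'$gte': startofday - timedelta(days=5)}
    }},
    {'$group': {
        '_id': {
            'outcome': '$crawlState',
            'day': {'$dateToString': {'format': '%Y-%m-%d', 'date': '$crawlDate'}}
        },
        'count': {'$sum': 1}
    }},
    {'$sort': {'_id.day': 1}}
])
# each result document carries its own crawlState, e.g.
# {'_id': {'outcome': 'NEW', 'day': '2018-02-07'}, 'count': 7098}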