aggregation of crawled references using mongodb
{
  "referrerLinkText": null,
  "isRootParentReference": false,
  "sitemapLastMod": null,
  "parentRootReference": null,
  "referrerLinkTag": null,
  "sitemapChangeFreq": null,
  "crawlState": "REJECTED",
  "isValid": false,
  "contentType": "text/html",
  "stage": "PROCESSED",
  "sitemapPriority": null,
  "referrerReference": null,
  "referrerLinkTitle": null,
  "crawlDate": "2018-02-06T22:21:19.869000",
  "reference": "https://stemcells.nih.gov",
  "depth": 0,
  "contentChecksum": null,
  "originalReference": null,
  "_id": "5a7a289639ec2e4736d0854b",
  "metaChecksum": null
}
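The aggregation snippets below assume that refs is a pymongo collection holding documents shaped like the one above. A minimal sketch of how that handle might be obtained; the connection URI, database name, and collection name here are hypothetical:

# Sketch: obtain the "refs" collection used in the aggregation snippets below.
# The URI, database name, and collection name are hypothetical placeholders.
from pymongo import MongoClient

client = MongoClient('mongodb://localhost:27017')
refs = client['crawler']['references']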
/* Pipe the data back and build a chart like this */
var chart = c3.generate({
    size: {
        width: 960,
    },
    data: {
        x: 'x',
        columns: [
            ['x', '2018-02-05', '2018-02-06', '2018-02-07', '2018-02-08', '2018-02-09', '2018-02-10'],
            ['New', 300, 350, 300, 0, 0, 120],
            ['Redirect', 130, 100, 140, 200, 150, 50],
            ['Other', 12, 16, 20, 12, 10, 7],
        ],
        types: {
            New: 'area',
            Redirect: 'area',
            Other: 'area'
            // 'line', 'spline', 'step', 'area', 'area-step' are also available to stack
        },
        groups: [['New', 'Redirect', 'Other']],
        colors: {
            New: '#18993c',
            Redirect: '#f4b642',
            Other: '#f45342'
        }
    },
    axis: {
        x: {
            type: 'timeseries',
            tick: {
                format: '%Y-%m-%d'
            }
        },
        y: {
            label: 'Pages'
        }
    }
});
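The column arrays above were filled in by hand from aggregation output. A rough Python sketch of how per-day counts could be reshaped into that columns format; the series names and counts below are illustrative, taken from the faceted aggregation further down:

# Sketch: reshape per-day counts into the c3 'columns' arrays used above.
# day_counts maps a series name to {date string: count}; the values are illustrative.
day_counts = {
    'New':      {'2018-02-06': 837, '2018-02-07': 7098, '2018-02-08': 3699},
    'Redirect': {'2018-02-06': 69,  '2018-02-07': 1829, '2018-02-08': 1613},
    'Other':    {'2018-02-06': 33,  '2018-02-07': 261,  '2018-02-08': 99},
}
dates = sorted({d for counts in day_counts.values() for d in counts})
columns = [['x'] + dates]
for name, counts in day_counts.items():
    columns.append([name] + [counts.get(d, 0) for d in dates])
# columns can then be serialized to JSON and dropped into the chart definition above.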
# assume refs is a reference to a Mongo collection
cursor = refs.aggregate([
    # the "$match" stage selects references that have been processed.
    # they should all have a crawlState, but just in case, we make sure of it.
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True}
    }},
    # the "$group" stage counts up the references in each state
    {'$group': {
        '_id': {'outcome': '$crawlState'},
        'count': {'$sum': 1}
    }}
])
# get the results
list(cursor)
# In my case, I get the following:
[{'_id': {'outcome': 'REJECTED'}, 'count': 264},
 {'_id': {'outcome': 'BAD_STATUS'}, 'count': 89},
 {'_id': {'outcome': 'REDIRECT'}, 'count': 3511},
 {'_id': {'outcome': 'NOT_FOUND'}, 'count': 19},
 {'_id': {'outcome': 'ERROR'}, 'count': 50},
 {'_id': {'outcome': 'NEW'}, 'count': 11634}]
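If a flat mapping from crawlState to count is more convenient than the list of grouped documents, a small sketch using a dict comprehension (the pipeline is repeated here because the cursor above is already exhausted):

# Sketch: flatten the grouped documents into a {crawlState: count} mapping.
counts = {doc['_id']['outcome']: doc['count']
          for doc in refs.aggregate([
              {'$match': {'stage': 'PROCESSED', 'crawlState': {'$exists': True}}},
              {'$group': {'_id': {'outcome': '$crawlState'}, 'count': {'$sum': 1}}}
          ])}
# for the run above, counts['NEW'] == 11634 and counts['REDIRECT'] == 3511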
# again, we assume refs is the collection
from datetime import datetime, timedelta, time
now = datetime.now()
startofday = datetime.combine(now, time.min)
# these will be the buckets for our aggregation
boundaries = [datetime.min] + [startofday - timedelta(days=i) for i in range(5, 0, -1)] + [now]
# Mongo wants the array of buckets to be sorted in ascending order, which is why we used range(5, 0, -1).
# anyway, we get something like this:
[datetime.datetime(1, 1, 1, 0, 0),
 datetime.datetime(2018, 2, 4, 0, 0),
 datetime.datetime(2018, 2, 5, 0, 0),
 datetime.datetime(2018, 2, 6, 0, 0),
 datetime.datetime(2018, 2, 7, 0, 0),
 datetime.datetime(2018, 2, 8, 0, 0),
 datetime.datetime(2018, 2, 9, 15, 15, 3, 165743)]
# Now, we construct a bucketing stage based on those boundaries
bucket = {'$bucket': {
    'groupBy': '$crawlDate',
    'boundaries': boundaries,
    'output': {
        'count': {'$sum': 1}
    }
}}
# The two most popular crawlStates are 'NEW' and 'REDIRECT', so we lump the rest together:
cursor = refs.aggregate([
    # We again select references that have been processed, but this time we also check that the dates are recent enough
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True},
        'crawlDate': {'$gte': startofday - timedelta(days=5)}
    }},
    # now we facet them
    {'$facet': {
        'new': [
            {'$match': {'crawlState': 'NEW'}}, bucket
        ],
        'redirect': [
            {'$match': {'crawlState': 'REDIRECT'}}, bucket
        ],
        'other': [
            {'$match': {'crawlState': {'$nin': ['NEW', 'REDIRECT']}}}, bucket
        ]
    }}
])
# For this calculation, I get reasonable results, since I restarted the crawl from scratch recently
# and the shutdown was real enough that it is not running today:
list(cursor)
[{'new': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 837},
          {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 7098},
          {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 3699}],
  'other': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 33},
            {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 261},
            {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 99}],
  'redirect': [{'_id': datetime.datetime(2018, 2, 6, 0, 0), 'count': 69},
               {'_id': datetime.datetime(2018, 2, 7, 0, 0), 'count': 1829},
               {'_id': datetime.datetime(2018, 2, 8, 0, 0), 'count': 1613}]}]
# How can I do this without knowing the crawlStates in advance?
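One possible way to avoid hard-coding the crawlStates (a sketch I have not run against this collection): instead of faceting per state, group on the state and the calendar day together in a single $group stage, using $dateToString (available since MongoDB 3.0) to truncate crawlDate to a day:

# Sketch: one $group keyed by crawlState and day, so no list of states is needed.
cursor = refs.aggregate([
    {'$match': {
        'stage': 'PROCESSED',
        'crawlState': {'$exists': True},
        'crawlDate': {'$gte': startofday - timedelta(days=5)}
    }},
    {'$group': {
        '_id': {
            'outcome': '$crawlState',
            'day': {'$dateToString': {'format': '%Y-%m-%d', 'date': '$crawlDate'}}
        },
        'count': {'$sum': 1}
    }},
    {'$sort': {'_id.day': 1}}
])
# each result document carries its own crawlState, e.g.
# {'_id': {'outcome': 'NEW', 'day': '2018-02-07'}, 'count': 7098}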