Skip to content

Instantly share code, notes, and snippets.

@stevelacey
Created June 26, 2014 10:46
Show Gist options
  • Select an option

  • Save stevelacey/0c24263b3bd204e6ccdd to your computer and use it in GitHub Desktop.

Select an option

Save stevelacey/0c24263b3bd204e6ccdd to your computer and use it in GitHub Desktop.
select
max(x.previous_crawl_id) as previous_crawl_id,
x.crawl_id,
max(x.previous_crawl_date) as previous_crawl_date,
x.crawl_date,
sum(a.body_words) as previous_body_words,
sum(b.body_words) as body_words,
(sum(b.body_words) - sum(a.body_words)) as body_words_diff,
sum(a.article_words) as previous_article_words,
sum(b.article_words) as article_words,
(sum(b.article_words) - sum(a.article_words)) as article_words_diff,
sum(a.page_or_article_words) as previous_page_or_article_words,
sum(b.page_or_article_words) as page_or_article_words,
(sum(b.page_or_article_words) - sum(a.page_or_article_words)) as page_or_article_words_diff,
sum(a.p_words) as previous_p_words,
sum(b.p_words) as p_words,
(sum(b.p_words) - sum(a.p_words)) as p_words_diff,
sum(a.p_count) as previous_p_count,
sum(b.p_count) as p_count,
(sum(b.p_count) - sum(a.p_count)) as p_count_diff,
sum(a.img_count) as previous_img_count,
sum(b.img_count) as img_count,
(sum(b.img_count) - sum(a.img_count)) as img_count_diff
from
(
select
(select crawl_id from changes where crawl_date < c.crawl_date order by crawl_date desc limit 1) previous_crawl_id,
crawl_id,
(select crawl_date from changes where crawl_date < c.crawl_date order by crawl_date desc limit 1) previous_crawl_date,
crawl_date
from changes c
group by crawl_id, crawl_date
order by max(crawl_date)
) x
left join changes b on x.crawl_id = b.crawl_id
left join changes a on b.page_uri = a.page_uri and x.previous_crawl_id = a.crawl_id
where x.previous_crawl_date is not null
group by x.crawl_id, x.crawl_date
order by x.crawl_date
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment