Skip to content

Instantly share code, notes, and snippets.

@milimetric
Last active July 21, 2016 14:01
Show Gist options
  • Save milimetric/004a8d05907605e4d3fe1c8cfacc9cea to your computer and use it in GitHub Desktop.
Save milimetric/004a8d05907605e4d3fe1c8cfacc9cea to your computer and use it in GitHub Desktop.
-- join oldTitle, oldNamespace, timestamp to title, namespace, end and vice versa, should match perfectly
select sum(if(log_title is null, 1, 0)) as history_without_log_line,
sum(if(page_title is null, 1, 0)) as log_line_without_history
from mforns.simplewiki_page_history
FULL OUTER JOIN
milimetric.simplewiki_logging ON cast(log_title as string) = page_title
AND log_namespace = page_namespace
AND cast(log_timestamp as string) = `end`
AND `end` is not null
;
-- returns 0, 0
-- group by title, namespace, end, should never be duplicated
select page_title,
page_namespace,
`end`,
count(*)
from mforns.simplewiki_page_history
group by
page_title,
page_namespace,
`end`
having count(*) > 1
;
-- returns 24100 rows
-- join pageId, end is null, page_create_timestamp to page_id, rev_timestamp for rev_parent_id = 0 and vice versa, ditto
select sum(if(page_id is null, 1, 0)) as created_page_without_history,
sum(if(rev_page is null, 1, 0)) as history_without_created_page
from mforns.simplewiki_page_history
FULL OUTER JOIN
milimetric.simplewiki_revision ON page_id = rev_page
AND page_creation_timestamp = cast(rev_timestamp as string)
AND rev_parent_id = 0
AND `end` is null
AND page_creation_timestamp is not null
AND page_id is not null
;
-- returns 4866731, 326601
-- simpler way to check some of the above
select p.page_id
from milimetric.simplewiki_page p
left join
mforns.simplewiki_page_history h ON h.page_id = p.page_id
where h.page_id is null
;
-- returns 0 rows
select h.page_id
from mforns.simplewiki_page_history h
left join
milimetric.simplewiki_page p ON p.page_id = h.page_id
where h.page_id is not null
and p.page_id is null
;
-- returns 0 rows
select h.page_id,
h.page_creation_timestamp
from (select distinct page_id, page_creation_timestamp
from mforns.simplewiki_page_history
where page_id is not null
and `end` is null
) h
left join
milimetric.simplewiki_page p ON p.page_id = h.page_id
where p.page_id is null
;
-- returns 0 rows
-- general probing query
select coalesce(page_creation_timestamp, ' none ') as creation,
coalesce(start, ' none ') as start_time,
coalesce(`end`, ' none ') as end_time,
caused_by_event_type as event,
page_id,
page_namespace as ns,
page_title
from mforns.simplewiki_page_history
where page_title = 'Main_Page'
and page_namespace = 0
order by
page_id,
start_time
limit 1000
;
select coalesce(page_creation_timestamp, ' none ') as creation,
coalesce(start, ' none ') as start_time,
coalesce(`end`, ' none ') as end_time,
caused_by_event_type as event,
page_id,
page_namespace as ns,
page_title
from mforns.simplewiki_page_history
where page_title = 'Xbox'
and page_namespace = 1
order by
page_id,
start_time
limit 1000
;
select coalesce(page_creation_timestamp, ' none ') as creation,
coalesce(start, ' none ') as start_time,
coalesce(`end`, ' none ') as end_time,
caused_by_event_type as event,
page_id,
page_namespace as ns,
page_title,
page_id_artificial
from mforns.simplewiki_page_history
where page_id = 232335
order by
page_id,
start_time
limit 1000
;
select coalesce(page_creation_timestamp, ' none ') as creation,
coalesce(start, ' none ') as start_time,
coalesce(`end`, ' none ') as end_time,
caused_by_event_type as event,
page_id,
page_namespace as ns,
page_title,
page_id_artificial
from mforns.simplewiki_page_history
where page_id = 26975
order by
page_id,
start_time
limit 1000
;
select coalesce(page_creation_timestamp, ' none ') as creation,
coalesce(start, ' none ') as start_time,
coalesce(`end`, ' none ') as end_time,
caused_by_event_type as event,
page_id,
page_namespace as ns,
page_title,
page_id_artificial
from mforns.simplewiki_page_history
where page_id = 459354
order by
page_id,
start_time
limit 1000
;
select *
from logging
where log_type = 'delete' and log_action = 'delete'
and log_namespace = 0
and (
cast(log_title as char(100)) COLLATE utf8_general_ci in ('Main_Page', 'Main_Page/Prerelease')
or rtrim(replace(cast(log_params as char(1000)), '\n', '')) COLLATE utf8_general_ci in ('Main Page', 'Main_Page/Prerelease')
)
;
select count(*)
from mforns.simplewiki_page_history h
inner join
milimetric.simplewiki_revision r on r.rev_page = h.page_id
and cast(r.rev_timestamp as string) = h.page_creation_timestamp
and r.rev_parent_id = 0
and h.caused_by_event_type = 'create'
;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment