Skip to content

Instantly share code, notes, and snippets.

@hamelsmu
Last active April 28, 2022 01:47
Show Gist options
  • Save hamelsmu/097a50bd9c7ee28465dcdd0bfd565532 to your computer and use it in GitHub Desktop.
Save hamelsmu/097a50bd9c7ee28465dcdd0bfd565532 to your computer and use it in GitHub Desktop.
Get Github Issues from Bigquery
-- Access this query at https://console.cloud.google.com/bigquery?sq=235037502967:a71a4b32d74442558a2739b581064e5f
SELECT url, title, body
FROM(
SELECT url, title, body
, RANK() OVER (PARTITION BY SUBSTR(body, 75, 125) ORDER BY url) as count_body_beg
FROM(
SELECT url, title, body
, RANK() OVER (PARTITION BY SUBSTR(body, 50, 100) ORDER BY url) as count_body_beg
FROM(
SELECT url, title, body
, RANK() OVER (PARTITION BY SUBSTR(body, 25, 75) ORDER BY url) as count_body_beg
FROM(
SELECT url, title, body
, RANK() OVER (PARTITION BY SUBSTR(body, 0, 50) ORDER BY url) as count_body_beg
FROM(
SELECT DISTINCT
url
-- replace more than one white-space character in a row with a single space
, REGEXP_REPLACE(title, r"\s{2,}", ' ') as title
, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
, RANK() OVER (PARTITION BY SUBSTR(title, 0, 25) ORDER BY url) as count_title_beg
-- , RANK() OVER (PARTITION BY SUBSTR(body, 0, 1000) ORDER BY url) as count_body_beg
FROM(
SELECT
JSON_EXTRACT(payload, '$.issue.html_url') as url
-- extract the title and body removing parentheses, brackets, and quotes
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
FROM `githubarchive.day.2021*`
WHERE
-- ALL Of 2021
_TABLE_SUFFIX BETWEEN '0101' and '1231'
and type="IssuesEvent"
-- Only want the issue at a specific point otherwise will have duplicates
and JSON_EXTRACT(payload, '$.action') = "\"opened\""
UNION ALL
SELECT
JSON_EXTRACT(payload, '$.issue.html_url') as url
-- extract the title and body removing parentheses, brackets, and quotes
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
FROM `githubarchive.day.2020*`
WHERE
-- ALL Of 2020
_TABLE_SUFFIX BETWEEN '0101' and '1231'
and type="IssuesEvent"
-- Only want the issue at a specific point otherwise will have duplicates
and JSON_EXTRACT(payload, '$.action') = "\"opened\""
) as tbl
WHERE
-- the body must be at least 8 words long and the title at least 3 words long
-- this is an arbitrary way to filter out empty or sparse issues
ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
-- filter out issues that have really long titles or bodies
-- (these are outliers, and will slow tokenization down).
and LENGTH(title) <= 400
and LENGTH(body) <= 2000
) tbl2
WHERE count_title_beg = 1
)tbl3
WHERE count_body_beg = 1
)tbl4
WHERE count_body_beg = 1
)tbl5
WHERE count_body_beg = 1
)tbl6
WHERE count_body_beg = 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment