Last active
April 28, 2022 01:47
-
-
Save hamelsmu/097a50bd9c7ee28465dcdd0bfd565532 to your computer and use it in GitHub Desktop.
Get Github Issues from Bigquery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Access this query at https://console.cloud.google.com/bigquery?sq=235037502967:a71a4b32d74442558a2739b581064e5f | |
SELECT url, title, body | |
FROM( | |
SELECT url, title, body | |
, RANK() OVER (PARTITION BY SUBSTR(body, 75, 125) ORDER BY url) as count_body_beg | |
FROM( | |
SELECT url, title, body | |
, RANK() OVER (PARTITION BY SUBSTR(body, 50, 100) ORDER BY url) as count_body_beg | |
FROM( | |
SELECT url, title, body | |
, RANK() OVER (PARTITION BY SUBSTR(body, 25, 75) ORDER BY url) as count_body_beg | |
FROM( | |
SELECT url, title, body | |
, RANK() OVER (PARTITION BY SUBSTR(body, 0, 50) ORDER BY url) as count_body_beg | |
FROM( | |
SELECT DISTINCT | |
url | |
-- replace more than one white-space character in a row with a single space | |
, REGEXP_REPLACE(title, r"\s{2,}", ' ') as title | |
, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body | |
, RANK() OVER (PARTITION BY SUBSTR(title, 0, 25) ORDER BY url) as count_title_beg | |
-- , RANK() OVER (PARTITION BY SUBSTR(body, 0, 1000) ORDER BY url) as count_body_beg | |
FROM( | |
SELECT | |
JSON_EXTRACT(payload, '$.issue.html_url') as url | |
-- extract the title and body removing parentheses, brackets, and quotes | |
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title | |
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body | |
FROM `githubarchive.day.2021*` | |
WHERE | |
-- ALL Of 2021 | |
_TABLE_SUFFIX BETWEEN '0101' and '1231' | |
and type="IssuesEvent" | |
-- Only want the issue at a specific point otherwise will have duplicates | |
and JSON_EXTRACT(payload, '$.action') = "\"opened\"" | |
UNION ALL | |
SELECT | |
JSON_EXTRACT(payload, '$.issue.html_url') as url | |
-- extract the title and body removing parentheses, brackets, and quotes | |
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title | |
, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body | |
FROM `githubarchive.day.2020*` | |
WHERE | |
-- ALL Of 2020 | |
_TABLE_SUFFIX BETWEEN '0101' and '1231' | |
and type="IssuesEvent" | |
-- Only want the issue at a specific point otherwise will have duplicates | |
and JSON_EXTRACT(payload, '$.action') = "\"opened\"" | |
) as tbl | |
WHERE | |
-- the body must be at least 8 words long and the title at least 3 words long | |
-- this is an arbitrary way to filter out empty or sparse issues | |
ARRAY_LENGTH(SPLIT(body, ' ')) >= 6 | |
and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3 | |
-- filter out issues that have really long titles or bodies | |
-- (these are outliers, and will slow tokenization down). | |
and LENGTH(title) <= 400 | |
and LENGTH(body) <= 2000 | |
) tbl2 | |
WHERE count_title_beg = 1 | |
)tbl3 | |
WHERE count_body_beg = 1 | |
)tbl4 | |
WHERE count_body_beg = 1 | |
)tbl5 | |
WHERE count_body_beg = 1 | |
)tbl6 | |
WHERE count_body_beg = 1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment