hamelsmu · April 28, 2022 01:47
diff --git a/ghissues.sql b/ghissues.sql
 -- Access this query at https://console.cloud.google.com/bigquery?sq=235037502967:a71a4b32d74442558a2739b581064e5f

 SELECT url, title, body
 FROM(
 SELECT url, title, body
     , RANK() OVER (PARTITION BY SUBSTR(body, 75, 125) ORDER BY url) as count_body_beg
 FROM(
 SELECT url, title, body
   , RANK() OVER (PARTITION BY SUBSTR(body, 50, 100) ORDER BY url) as count_body_beg
 FROM(
 SELECT url, title, body
   , RANK() OVER (PARTITION BY SUBSTR(body, 25, 75) ORDER BY url) as count_body_beg
 FROM(
 SELECT url, title, body
 , RANK() OVER (PARTITION BY SUBSTR(body, 0, 50) ORDER BY url) as count_body_beg
 FROM(

    SELECT DISTINCT 
      url
      -- replace more than one white-space character in a row with a single space
    , REGEXP_REPLACE(title, r"\s{2,}", ' ') as title
    , REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
    , RANK() OVER (PARTITION BY SUBSTR(title, 0, 25) ORDER BY url) as count_title_beg
    -- , RANK() OVER (PARTITION BY SUBSTR(body, 0, 1000) ORDER BY url) as count_body_beg
    FROM(
        SELECT
            JSON_EXTRACT(payload, '$.issue.html_url') as url
            -- extract the title and body removing parentheses, brackets, and quotes
          , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
          , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
        FROM `githubarchive.day.2021*`
        WHERE 
          -- ALL Of 2021  
              _TABLE_SUFFIX BETWEEN '0101' and '1231'
          and type="IssuesEvent" 
          -- Only want the issue at a specific point otherwise will have duplicates
          and JSON_EXTRACT(payload, '$.action') = "\"opened\"" 
          UNION ALL 
              SELECT
            JSON_EXTRACT(payload, '$.issue.html_url') as url
            -- extract the title and body removing parentheses, brackets, and quotes
          , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as title
          , LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n|\(|\)|\[|\]|#|\*|`|\"", ' '))) as body
        FROM `githubarchive.day.2020*`
        WHERE 
          -- ALL Of 2020  
              _TABLE_SUFFIX BETWEEN '0101' and '1231'
          and type="IssuesEvent" 
          -- Only want the issue at a specific point otherwise will have duplicates
          and JSON_EXTRACT(payload, '$.action') = "\"opened\""

    ) as tbl

    WHERE 
      -- the body must be at least 8 words long and the title at least 3 words long
      --  this is an arbitrary way to filter out empty or sparse issues
          ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
      and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
      -- filter out issues that have really long titles or bodies
      --    (these are outliers, and will slow tokenization down).
      and LENGTH(title) <= 400
      and LENGTH(body) <= 2000
 ) tbl2
 WHERE count_title_beg = 1
 )tbl3
 WHERE count_body_beg = 1
 )tbl4
 WHERE count_body_beg = 1
 )tbl5
 WHERE count_body_beg = 1
 )tbl6
 WHERE count_body_beg = 1
	-- Access this query at https://console.cloud.google.com/bigquery?sq=235037502967:a71a4b32d74442558a2739b581064e5f

	SELECT url, title, body
	FROM(
	SELECT url, title, body
	, RANK() OVER (PARTITION BY SUBSTR(body, 75, 125) ORDER BY url) as count_body_beg
	FROM(
	SELECT url, title, body
	, RANK() OVER (PARTITION BY SUBSTR(body, 50, 100) ORDER BY url) as count_body_beg
	FROM(
	SELECT url, title, body
	, RANK() OVER (PARTITION BY SUBSTR(body, 25, 75) ORDER BY url) as count_body_beg
	FROM(
	SELECT url, title, body
	, RANK() OVER (PARTITION BY SUBSTR(body, 0, 50) ORDER BY url) as count_body_beg
	FROM(

	SELECT DISTINCT
	url
	-- replace more than one white-space character in a row with a single space
	, REGEXP_REPLACE(title, r"\s{2,}", ' ') as title
	, REGEXP_REPLACE(body, r"\s{2,}", ' ') as body
	, RANK() OVER (PARTITION BY SUBSTR(title, 0, 25) ORDER BY url) as count_title_beg
	-- , RANK() OVER (PARTITION BY SUBSTR(body, 0, 1000) ORDER BY url) as count_body_beg
	FROM(
	SELECT
	JSON_EXTRACT(payload, '$.issue.html_url') as url
	-- extract the title and body removing parentheses, brackets, and quotes
	, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n\|\(\|\)\|\[\|\]\|#\|\*\|`\|\"", ' '))) as title
	, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n\|\(\|\)\|\[\|\]\|#\|\*\|`\|\"", ' '))) as body
	FROM `githubarchive.day.2021*`
	WHERE
	-- ALL Of 2021
	_TABLE_SUFFIX BETWEEN '0101' and '1231'
	and type="IssuesEvent"
	-- Only want the issue at a specific point otherwise will have duplicates
	and JSON_EXTRACT(payload, '$.action') = "\"opened\""
	UNION ALL
	SELECT
	JSON_EXTRACT(payload, '$.issue.html_url') as url
	-- extract the title and body removing parentheses, brackets, and quotes
	, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.title'), r"\\n\|\(\|\)\|\[\|\]\|#\|\*\|`\|\"", ' '))) as title
	, LOWER(TRIM(REGEXP_REPLACE(JSON_EXTRACT(payload, '$.issue.body'), r"\\n\|\(\|\)\|\[\|\]\|#\|\*\|`\|\"", ' '))) as body
	FROM `githubarchive.day.2020*`
	WHERE
	-- ALL Of 2020
	_TABLE_SUFFIX BETWEEN '0101' and '1231'
	and type="IssuesEvent"
	-- Only want the issue at a specific point otherwise will have duplicates
	and JSON_EXTRACT(payload, '$.action') = "\"opened\""

	) as tbl

	WHERE
	-- the body must be at least 8 words long and the title at least 3 words long
	-- this is an arbitrary way to filter out empty or sparse issues
	ARRAY_LENGTH(SPLIT(body, ' ')) >= 6
	and ARRAY_LENGTH(SPLIT(title, ' ')) >= 3
	-- filter out issues that have really long titles or bodies
	-- (these are outliers, and will slow tokenization down).
	and LENGTH(title) <= 400
	and LENGTH(body) <= 2000
	) tbl2
	WHERE count_title_beg = 1
	)tbl3
	WHERE count_body_beg = 1
	)tbl4
	WHERE count_body_beg = 1
	)tbl5
	WHERE count_body_beg = 1
	)tbl6
	WHERE count_body_beg = 1