Last active
August 17, 2017 04:59
-
-
Save steren/f661ee006c2fff52ab9fd44d9570fc7e to your computer and use it in GitHub Desktop.
Look at constant Go regular expressions on GitHub and see if they match an optim
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Analyse impact of https://github.com/golang/go/issues/21463
# using the BigQuery GitHub public dataset (BigQuery legacy SQL).
#
# Output: one row per `applies` label, with `cnt` = number of extracted
# regular-expression literals that received that label.
#
# To run on the entire GitHub corpus,
# remove the `sample_` prefix from the table names.
# Warning: This query processes ~2.2 TB of data, which is above BigQuery free quota.
SELECT
  applies,
  COUNT(*) AS cnt
FROM (
  # Label every extracted pattern. WHEN branches are tested top to bottom and
  # the first one that matches wins; each label embeds the regex that
  # produced it so the result rows are self-describing.
  SELECT
    CASE
      # Pattern contains a `*`, `+` or `?` character.
      WHEN REGEXP_MATCH(reg, r'[\*\+\?]') THEN "no: [\\*\\+\\?]"
      # A literal `^` followed only by word/digit/space/dot characters up to the end.
      WHEN REGEXP_MATCH(reg, r'\^[\w\d\s\.]+$') THEN "yes: \\^[\\w\\d\\s\\.]+$"
      # A run of word/digit/space/dot characters immediately followed by a literal `$`.
      WHEN REGEXP_MATCH(reg, r'[\w\d\s\.]+\$') THEN "yes: [\\w\\d\\s\\.]+\\$"
      # Whole pattern is word/digit/space runs, optionally separated by `|`.
      WHEN REGEXP_MATCH(reg, r'^([\w\d\s]+\|?)+$') THEN "yes: ^([\\w\\d\\s]+\\|?)+$"
      # Whole pattern is non-empty and contains neither `^` nor `$`.
      WHEN REGEXP_MATCH(reg, r'^[^\^\$]+$') THEN "no: ^[^\\^\\$]+$"
      ELSE "maybe"
    END AS applies,
    reg
  FROM (
    # Extract the first double-quoted argument of a `.MatchString("...", ` call
    # from each fragment. The dot is escaped so it only matches a literal `.`,
    # and the capture is lazy so it stops at the first `", ` instead of
    # swallowing later arguments that happen to contain `", ` on the same line.
    SELECT
      REGEXP_EXTRACT(content, r'\.MatchString\("(.*?)", ') AS reg
    FROM (
      # Files that call regexp.MatchString with a string literal, split on the
      # text "regexp" so that each call site lands in its own fragment.
      SELECT
        id,
        SPLIT(content, "regexp") AS content
      FROM
        [bigquery-public-data:github_repos.sample_contents]
      WHERE
        REGEXP_MATCH(content, r'regexp\.MatchString\("')
    ) AS C
    JOIN (
      # Restrict to blobs that appear under at least one path ending in `.go`;
      # GROUP BY de-duplicates ids so the join does not multiply rows.
      SELECT
        id
      FROM
        [bigquery-public-data:github_repos.sample_files]
      WHERE
        path LIKE '%.go'
      GROUP BY
        id
    ) AS F
    ON
      C.id = F.id
  )
)
# Fragments without a matching call yield NULL from REGEXP_EXTRACT; drop them
# with a real NULL test (comparing against the string "null" would also throw
# away a pattern whose text is literally `null`).
WHERE
  reg IS NOT NULL
GROUP BY
  applies
ORDER BY
  applies DESC
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
For anyone wanting to run this query - try extracting all the .go files first, or use the extract I left at https://bigquery.cloud.google.com/table/fh-bigquery:github_extracts.contents_go.
(Avoid querying [bigquery-public-data:github_repos.sample_contents], as it's too big and full of non-.go content.)