Created
April 12, 2020 19:16
-
-
Save lexi-lambda/9ab0ce6e572ae75b3acf668944abbfba to your computer and use it in GitHub Desktop.
A hacky script for scraping packages that use arrow notation from Hackage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#lang racket | |
(require db/base | |
db/sqlite3 | |
json | |
net/url | |
racket/async-channel | |
threading) | |
(define WORKERS 6) | |
(define cabal-path (find-executable-path "cabal")) | |
(define grep-path (find-executable-path "grep")) | |
(define (initialize-database!) | |
(unless (table-exists? the-package-db "packages") | |
(query-exec the-package-db "CREATE TABLE packages(name TEXT PRIMARY KEY, status TEXT)") | |
(query-exec the-package-db "CREATE INDEX packages_status ON packages(status)"))) | |
(define (fetch-package-list-from-hackage) | |
(define packages (call/input-url (string->url "https://hackage.haskell.org/packages/") | |
(λ (url) (get-pure-port url (list "Accept: application/json"))) | |
read-json)) | |
(for/list ([package (in-list packages)]) | |
(hash-ref package 'packageName))) | |
(define (prune-package-list packages) | |
(define values-expr (~> (for/list ([n (in-range (length packages))]) (~a "$" n)) | |
(string-join _ "),(") | |
(string-append "VALUES(" _ ")"))) | |
(apply query-list the-package-db | |
(~a "WITH all_packages(name) AS (" values-expr ") " | |
"SELECT all_packages.name FROM all_packages " | |
"LEFT JOIN packages ON packages.name = all_packages.name " | |
"WHERE packages.name IS NULL " | |
" OR packages.status = 'fail'") | |
packages)) | |
(define (package-unpack-dir package) | |
(build-path "/tmp/cabal-packages" package)) | |
(define (unpack-package package) | |
(define destination (package-unpack-dir package)) | |
(parameterize ([current-input-port (open-input-string "")] | |
[current-output-port (open-output-nowhere)]) | |
(system* cabal-path "get" "-d" destination package))) | |
(define (locate-package-dir package) | |
(define destination (package-unpack-dir package)) | |
(and (directory-exists? destination) | |
(for/first ([subdir (in-list (directory-list destination))] | |
#:when (string-prefix? (path->string subdir) package)) | |
(build-path destination subdir)))) | |
(define (get-package-dir package) | |
(or (locate-package-dir package) | |
(begin (unpack-package package) | |
(locate-package-dir package)))) | |
(define (process-worklist initial-worklist worker-proc progress-proc #:workers [workers WORKERS]) | |
(define worklist-box (box initial-worklist)) | |
(define (get-work-item!) | |
(let loop () | |
(match (unbox worklist-box) | |
[(and worklist (cons item items)) | |
(if (box-cas! worklist-box worklist items) | |
item | |
(loop))] | |
['() #f]))) | |
(define progress-chan (make-async-channel 100)) | |
(define progress-thread | |
(thread | |
(λ () | |
(let loop () | |
(match (async-channel-get progress-chan) | |
[`#s(result ,worker-id ,item ,result) | |
(progress-proc worker-id item result) | |
(loop)] | |
['done (void)]))))) | |
(define worker-threads | |
(for/list ([worker-id (in-range workers)]) | |
(thread | |
(λ () | |
(let loop () | |
(define item (get-work-item!)) | |
(when item | |
(define result (with-handlers ([exn:fail? values]) | |
(worker-proc worker-id item))) | |
(async-channel-put progress-chan `#s(result ,worker-id ,item ,result)) | |
(loop))))))) | |
(for-each thread-wait worker-threads) | |
(async-channel-put progress-chan 'done) | |
(thread-wait progress-thread)) | |
(define the-package-db (sqlite3-connect #:database "/tmp/packages.sqlite3" #:mode 'create #:use-place #t)) | |
(initialize-database!) | |
(define (do-scan-for-packages!) | |
(define the-package-list (fetch-package-list-from-hackage)) | |
(define pruned-package-list (prune-package-list the-package-list)) | |
(define num-packages (length the-package-list)) | |
(process-worklist | |
pruned-package-list | |
(λ (worker-id package) | |
(define dir (get-package-dir package)) | |
(parameterize ([current-input-port (open-input-string "")]) | |
(system* grep-path "-REq" "\\bArrows\\b" dir))) | |
(let ([processed (- num-packages (length pruned-package-list))]) | |
(λ (worker-id package result) | |
(set! processed (add1 processed)) | |
(define percentage (* (/ processed num-packages) 100)) | |
(query-exec the-package-db | |
"INSERT OR REPLACE INTO packages(name, status) VALUES($1, $2)" | |
package | |
(match result | |
[#t "yes"] | |
[#f "no"] | |
[(? exn?) "fail"])) | |
(printf "(~a%) ~a ~a\n" | |
(~r percentage #:precision '(= 2) #:min-width 6) | |
(match result | |
[#t " YES"] | |
[#f " NO"] | |
[(? exn?) "FAIL"]) | |
package) | |
(flush-output))))) | |
(define (find-banana-paths package) | |
(define dir (get-package-dir package)) | |
(define-values [paths-in paths-out] (make-pipe)) | |
(parameterize ([current-input-port (open-input-string "")] | |
[current-output-port paths-out]) | |
(system* grep-path "-RFl" "(|" dir)) | |
(close-output-port paths-out) | |
(for*/list ([matching-path-string (in-lines paths-in)] | |
[matching-path (in-value (simple-form-path matching-path-string))] | |
#:unless (member (path-get-extension matching-path) | |
'(#f #".a" #".bin" #".c" #".class" #".el" #".eot" #".exe" #".hi" #".html" #".idr" #".js" #".md" | |
#".mp3" #".o" #".pdf" #".png" #".p_o" #".properties" #".ps" #".rsc" #".rst" #".rtree" | |
#".str" #".tbl" #".ttf" #".xml")) | |
#:unless (let ([segments (map path->string (explode-path matching-path))]) | |
(ormap (λ (needle) (member needle segments)) | |
'("Agda-2.6.1" ; \ idiom brackets | |
"idris-1.3.2" ; / | |
"aern2-mp-0.1.4" ; \ LaTeX | |
"AlgorithmW-0.1.1.0" ; | | |
"frown-0.6.2.3" ; | | |
"HaTeX-3.22.2.0" ; | | |
"myTestlll-1.0.0" ; | | |
"TransformersStepByStep-0.1.1.0" ; / | |
"arrows-0.4.4.2" ; already broken | |
"Font.hs" ; \ file is raw data | |
"SfxGong.hs" ; / | |
; non-implementation code | |
".git" "examples" "native" "test" "tests" "testdata" | |
; only uses are parsing Haskell | |
"arrowp-0.5.0.2" "CCA-0.1.5.3" "fast-tags-2.0.0" "freesect-0.8" "ghc-8.6.5" | |
"ghc-exactprint-0.6.3" "ghc-lib-8.10.1.20200324" "ghc-lib-parser-8.10.1.20200324" | |
"haskell-src-exts-1.23.0" "haskell-tools-ast-1.1.1.0" "highlighting-kate-0.6.4" | |
"module-management-0.21" "ormolu-0.0.3.1" "skylighting-0.8.3.4" "visual-prof-0.5"))) | |
#:when (call-with-input-file* matching-path | |
(λ (matching-in) (regexp-match? #px"\\(\\|(?![-|<>#$*&=)])" matching-in)))) | |
matching-path)) | |
(define (do-search-for-bananas!) | |
(define arrow-packages (query-list the-package-db "SELECT name FROM packages WHERE status = 'yes' ORDER BY name ASC")) | |
(define num-packages (length arrow-packages)) | |
(process-worklist | |
arrow-packages | |
(λ (worker-id package) | |
(not (empty? (find-banana-paths package)))) | |
(let ([processed 0]) | |
(λ (worker-id package result) | |
(set! processed (add1 processed)) | |
(define percentage (* (/ processed num-packages) 100)) | |
(when (eq? result #t) | |
(query-exec the-package-db "UPDATE packages SET status = 'bananas' WHERE name = $1" package)) | |
(printf "(~a%) ~a ~a\n" | |
(~r percentage #:precision '(= 2) #:min-width 6) | |
(match result | |
[#t "YES"] | |
[#f " NO"]) | |
package) | |
(flush-output))))) | |
(define (print-banana-paths!) | |
(define banana-packages (query-list the-package-db "SELECT name FROM packages WHERE status = 'bananas' ORDER BY name ASC")) | |
(for* ([package (in-list banana-packages)] | |
[matching-path (in-list (find-banana-paths package))]) | |
(displayln (path->string matching-path)))) | |
;(do-scan-for-packages!) | |
;(do-search-for-bananas!) | |
;(print-banana-paths!) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment