Last active
June 28, 2019 21:39
-
-
Save pauljm/870f61f74a1c4491cf1e to your computer and use it in GitHub Desktop.
Quick and dirty utility to list all the objects in an S3 bucket with a certain prefix and, for any whose key matches a pattern, read the file line by line and print any lines that match a second pattern. Adjust constants as appropriate. Usage: sbt 'run <AWS access key ID> <AWS secret key>'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Maven-style group id under which the project is published.
organization := "com.conspire"

// Project (artifact) name.
name := "s3inspect"

// The AWS SDK for Java is the only library dependency.
libraryDependencies += "com.amazonaws" % "aws-java-sdk" % "1.9.28.1"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Directory structure should be:
 *   - project_root
 *     - build.sbt (see below)
 *     - src
 *       - main
 *         - scala
 *           - S3Inspect.scala
 */
package s3inspect | |
import scala.collection.JavaConversions._ | |
import java.io.{ InputStream, InputStreamReader, BufferedReader, ByteArrayInputStream } | |
import java.util.regex.Pattern | |
import com.amazonaws.auth.BasicAWSCredentials | |
import com.amazonaws.services.s3.AmazonS3Client | |
import com.amazonaws.services.s3.model._ | |
/**
 * Quick and dirty utility to list all the objects in a bucket with a certain prefix
 * and, for any whose key matches a pattern, read the file line by line and print
 * any lines that match a second pattern. Adjust constants as appropriate.
 *
 * Usage: sbt 'run <AWS access key ID> <AWS secret key>'
 */
object S3Inspect {
  import java.util.concurrent.atomic.AtomicInteger
  import scala.collection.JavaConverters._

  /** Name of the bucket to scan. */
  val BucketName: String = "my-bucket-name"

  /** Only keys starting with this prefix are listed. */
  val ObjectPrefix: String = "prefix/to/haystack/"

  /**
   * Pattern the WHOLE object key must match for the object to be inspected.
   * The prefix is quoted so that regex metacharacters in an adjusted prefix are taken literally.
   */
  val ObjectPattern: Pattern = Pattern.compile(Pattern.quote(ObjectPrefix) + "[0-9]+\\.txt")

  /** Pattern searched for (anywhere) within each line of an inspected object. */
  val LinePattern: Pattern = Pattern.compile("a|(some)needles?")

  /**
   * Entry point. Expects exactly two arguments: the AWS access key ID and the AWS secret key.
   * Prints a usage message and exits with status 1 otherwise.
   */
  def main(args: Array[String]): Unit = {
    // AWS credentials are supplied at the command line
    if (args.length != 2) {
      println("Usage: sbt 'run <AWS access key ID> <AWS secret key>'")
      sys.exit(1)
    }
    val s3 = new AmazonS3Client(new BasicAWSCredentials(args(0), args(1)))
    inspectBucket(s3)
  }

  /**
   * List objects with the specified prefix and, for those that match the (whole) object pattern,
   * check lines against the line pattern.
   *
   * @param s3 client used both to list the bucket and to fetch each matching object
   */
  def inspectBucket(s3: AmazonS3Client): Unit = {
    // Pool of 10 workers to process objects in parallel
    val pool = new scala.concurrent.forkjoin.ForkJoinPool(10)
    val taskSupport = new scala.collection.parallel.ForkJoinTaskSupport(pool)
    val request = new ListObjectsRequest().withBucketName(BucketName).withPrefix(ObjectPrefix)
    // Incremented from several worker threads at once, so it must be atomic:
    // a plain `var` with `+= 1` loses updates under contention.
    val matchCount = new AtomicInteger(0)

    try {
      // Keep listing, one page at a time, until all objects are listed
      var morePages = true
      while (morePages) {
        val listing = s3.listObjects(request)
        // Use parallel collections to process several objects of this page in parallel
        val summaries = listing.getObjectSummaries.asScala.par
        summaries.tasksupport = taskSupport
        summaries.foreach { summary =>
          val key = summary.getKey
          // Inspect matching objects and track total objects inspected
          if (ObjectPattern.matcher(key).matches()) {
            inspectObject(s3, key)
            // incrementAndGet hands each worker a distinct value, so every
            // multiple of 100 is reported exactly once.
            val processed = matchCount.incrementAndGet()
            if (processed % 100 == 0)
              println(s"...$processed matching objects processed...")
          }
        }
        // Continue the next request from where this page ended
        request.setMarker(listing.getNextMarker)
        morePages = listing.isTruncated
      }
    } finally {
      // Release the worker threads even if listing or inspection fails
      pool.shutdown()
    }
  }

  /**
   * Read object and print lines that include (don't have to match completely) the line pattern.
   * A hit that occurs after an empty line has been seen in the same object is flagged
   * with an extra marker line.
   *
   * @param s3  client used to fetch the object
   * @param key key of the object to read from [[BucketName]]
   */
  def inspectObject(s3: AmazonS3Client, key: String): Unit = {
    val content = s3.getObject(BucketName, key).getObjectContent
    try {
      val reader = new BufferedReader(new InputStreamReader(content, "UTF-8"))
      var afterEmptyLine = false
      // readLine() returns null at end of stream
      Iterator.continually(reader.readLine()).takeWhile(_ != null).foreach { line =>
        if (LinePattern.matcher(line).find()) {
          val hit = s"$key: $line"
          // Emit the hit and its marker in a single println so that output from
          // other workers cannot be interleaved between the two lines.
          println(
            if (afterEmptyLine) hit + scala.util.Properties.lineSeparator + " ^^^ AFTER EMPTY LINE ^^^"
            else hit
          )
        } else if (line.isEmpty) {
          afterEmptyLine = true
        }
      }
    } finally {
      // Always release the underlying connection held by the object content stream
      content.close()
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment