Last active
March 27, 2020 17:45
-
-
Save balvinder294/b8f4c62f704e42bc109de8a0a8dd217b to your computer and use it in GitHub Desktop.
Extract Data from Any Document like Resume, Bill, or any Form with AWS Textract Synchronous Snippet --- Tekraze.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //Imports for packages used | |
| import com.amazonaws.auth.AWSStaticCredentialsProvider; | |
| import com.amazonaws.auth.BasicAWSCredentials; | |
| import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration; | |
| import com.amazonaws.services.textract.AmazonTextract; | |
| import com.amazonaws.services.textract.AmazonTextractClientBuilder; | |
| import com.amazonaws.services.textract.model.AnalyzeDocumentRequest; | |
| import com.amazonaws.services.textract.model.AnalyzeDocumentResult; | |
| import com.amazonaws.services.textract.model.Document; | |
| import com.amazonaws.services.textract.model.S3Object; | |
| import java.nio.ByteBuffer; | |
| import java.util.List; | |
| // | |
| /******************************** Step 1 | |
| Build the static credentials provider that authorizes the Textract client. | |
| Replace both placeholders with your own keys and keep real keys out of source control. */ | |
| String accessKeyId = "YOUR_APP_ID"; | |
| String secretAccessKey = "YOUR_APP_KEY"; | |
| BasicAWSCredentials awsCreds = new BasicAWSCredentials(accessKeyId, secretAccessKey); | |
| AWSStaticCredentialsProvider awsStaticCredentialsProvider = | |
| new AWSStaticCredentialsProvider(awsCreds); | |
| /**************************** Step 2 | |
| Build the endpoint configuration for the service to call. | |
| This snippet targets Textract in us-east-2. The URL is derived from the region | |
| so the endpoint and the signing region cannot drift apart when the region is changed. | |
| */ | |
| String region = "us-east-2"; | |
| String regionUrl = "https://textract." + region + ".amazonaws.com"; | |
| EndpointConfiguration endpoint = new EndpointConfiguration(regionUrl, region); | |
| /*************************************************** Step 3 | |
| Initialize the Textract client from the credentials provider (Step 1) | |
| and the endpoint configuration (Step 2). | |
| */ | |
| AmazonTextractClientBuilder clientBuilder = AmazonTextractClientBuilder.standard(); | |
| clientBuilder.setCredentials(awsStaticCredentialsProvider); | |
| clientBuilder.setEndpointConfiguration(endpoint); | |
| AmazonTextract textractClient = clientBuilder.build(); | |
| /******************************************** Step 4 | |
| Create the Document to analyze, either from an S3 object or from a local file. | |
| */ | |
| /********** Option A: using an S3 object *************/ | |
| /*** The S3 object can be a PDF, PNG, or JPG ***/ | |
| // Describe which stored object to read | |
| S3Object s3Object = new S3Object(); | |
| s3Object.setBucket(bucketName); // name of the bucket where the object is stored | |
| s3Object.setName(objectName); // name of the object to get | |
| // Wrap the S3 reference in a Document | |
| Document doc = new Document(); | |
| doc.setS3Object(s3Object); | |
| /************* Option B: using a local document converted to an image *****************/ | |
| // Read the input file as byte[] and hand those bytes to a new Document. | |
| // NOTE(review): `file` is not declared in this snippet; presumably an upload wrapper exposing getBytes() - confirm. | |
| Document document = new Document(); | |
| document.setBytes(ByteBuffer.wrap(file.getBytes())); | |
| /********************** Step 5************* | |
| * Build the AnalyzeDocument request | |
| */ | |
| AnalyzeDocumentRequest request = new AnalyzeDocumentRequest(); | |
| request.setDocument(document); // the local-file document created in Step 4 | |
| request.withFeatureTypes("TABLES", "FORMS"); // features to extract: tables, forms, or both | |
| /************************Step 6********************* | |
| * Send the request and obtain the AnalyzeDocument result | |
| */ | |
| AnalyzeDocumentResult result = textractClient.analyzeDocument(request); | |
| /************** The result is now ready to use ******************/ | |
| /****************** Step 7*************************** | |
| * Reading data from the Blocks | |
| Blocks are nested items holding the data of a page, a line, or a word, plus their relationships. | |
| ***/ | |
| // All blocks contained in the result | |
| List<com.amazonaws.services.textract.model.Block> blocks = result.getBlocks(); | |
| // Walk the list and print the details of each block | |
| for (com.amazonaws.services.textract.model.Block block : blocks) { | |
| System.out.println(block.getBlockType()); // block type, e.g. LINE or WORD | |
| System.out.println(block.getText()); // text extracted from the file for this block | |
| System.out.println(block.getConfidence()); // confidence reported for the extracted data | |
| System.out.println(block.getEntityTypes()); // entity types, e.g. key or value | |
| System.out.println(block.getRelationships()); // links between blocks, e.g. a key pointing to its value | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment