Skip to content

Instantly share code, notes, and snippets.

#!/bin/bash
# Settings for a Content Warehouse "linkedSources" request on a single document.

# Access token printed by gcloud for the application-default credentials.
TOKEN="$(gcloud auth application-default print-access-token)"

# Project identifier that appears in the request path.
PROJECT_ID='663188713804'

# Document addressed by the request (05079-86913.pdf).
DOCUMENT_ID='3m69velh07nj0'

# Folder identifiers; the trailing labels are the folder names noted by the
# author. Neither is referenced by the lines visible in this part of the file.
SOURCE_FOLDER_ID='2n0md82gaqn28'       # Unreviewed
DESTINATION_FOLDER_ID='5hrrjcq9h3150'  # Reviewed

# Full URL of the document's linkedSources resource in the "us" location.
ENDPOINT="https://contentwarehouse.googleapis.com/v1/projects/${PROJECT_ID}/locations/us/documents/${DOCUMENT_ID}/linkedSources"
import math
from sklearn.cluster import DBSCAN
import numpy as np
import statistics
from collections import Counter
def group_by_visual_row(data, eps=5, min_samples=1):
# Cluster report rows using the DBSCAN clustering algorithm to group OCR lines
# DBSCAN (Density-Based Spatial Clustering of Applications with Noise) can help