Skip to content

Instantly share code, notes, and snippets.

@mdoering
Last active August 29, 2015 14:06
Show Gist options
  • Save mdoering/02dcd749539ae060a861 to your computer and use it in GitHub Desktop.
Save mdoering/02dcd749539ae060a861 to your computer and use it in GitHub Desktop.
ValidationReport for both checklists and occurrences
package org.gbif.api.model.crawler;
import java.util.List;
import com.google.common.base.Objects;
/**
 * Result of checking the core/taxon identifiers of a checklist archive.
 * <p>
 * The report is valid only when both identifier samples handed to the constructor are empty. Validity and the
 * readable reason are derived once, at construction time, from snapshot copies of the two lists, so later changes
 * to the caller's lists cannot make them inconsistent with what this report holds.
 */
public class ChecklistValidationReport {
  // the number of records checked in the validation
  private final int checkedRecords;
  // false if we had to stop at our memory-saving limit
  private final boolean allRecordsChecked;
  // a sample of non unique core/taxon identifier (unmodifiable snapshot)
  private final List<String> duplicateIds;
  // a sample of data file line numbers with records missing a core/taxon identifier (unmodifiable snapshot)
  private final List<Integer> missingIds;
  // if the archive is not valid this will hold a readable reason, otherwise null
  private final String invalidationReason;
  // is this archive valid
  private final boolean valid;

  /**
   * @param checkedRecords    the number of records checked in the validation
   * @param allRecordsChecked false if the check stopped before all records were seen
   * @param duplicateIds      sample of non unique core/taxon identifiers, must not be null
   * @param missingIds        sample of data file line numbers lacking a core/taxon identifier, must not be null
   * @throws NullPointerException if either list is null
   */
  public ChecklistValidationReport(int checkedRecords, boolean allRecordsChecked, List<String> duplicateIds,
    List<Integer> missingIds) {
    // Fail with a named argument instead of an anonymous NPE from deep inside the validation.
    if (duplicateIds == null) {
      throw new NullPointerException("duplicateIds can't be null");
    }
    if (missingIds == null) {
      throw new NullPointerException("missingIds can't be null");
    }
    this.checkedRecords = checkedRecords;
    this.allRecordsChecked = allRecordsChecked;
    // Copy and wrap so the derived valid flag can never go stale through outside mutation.
    this.duplicateIds = java.util.Collections.unmodifiableList(new java.util.ArrayList<String>(duplicateIds));
    this.missingIds = java.util.Collections.unmodifiableList(new java.util.ArrayList<Integer>(missingIds));
    this.invalidationReason = findInvalidationReason();
    this.valid = invalidationReason == null;
  }

  /**
   * Determines why the archive is invalid. Duplicate identifiers take precedence over missing ones.
   * Must only be called once the two list fields are assigned.
   *
   * @return the readable reason, or null if nothing is wrong
   */
  private String findInvalidationReason() {
    if (!duplicateIds.isEmpty()) {
      return "Non unique taxon ids";
    }
    if (!missingIds.isEmpty()) {
      return "Missing taxon ids";
    }
    return null;
  }

  /**
   * @return the number of records checked in the validation
   */
  public int getCheckedRecords() {
    return checkedRecords;
  }

  /**
   * @return false if the check stopped before all records were seen
   */
  public boolean isAllRecordsChecked() {
    return allRecordsChecked;
  }

  /**
   * @return unmodifiable sample of non unique core/taxon identifiers, never null
   */
  public List<String> getDuplicateIds() {
    return duplicateIds;
  }

  /**
   * @return unmodifiable sample of data file line numbers lacking a core/taxon identifier, never null
   */
  public List<Integer> getMissingIds() {
    return missingIds;
  }

  /**
   * @return a readable reason why the archive is invalid, or null if it is valid
   */
  public String getInvalidationReason() {
    return invalidationReason;
  }

  /**
   * @return true if neither duplicate nor missing identifiers were reported
   */
  public boolean isValid() {
    return valid;
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(checkedRecords, allRecordsChecked, duplicateIds, missingIds, invalidationReason, valid);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final ChecklistValidationReport other = (ChecklistValidationReport) obj;
    return Objects.equal(this.checkedRecords, other.checkedRecords)
      && Objects.equal(this.allRecordsChecked, other.allRecordsChecked)
      && Objects.equal(this.duplicateIds, other.duplicateIds)
      && Objects.equal(this.missingIds, other.missingIds)
      && Objects.equal(this.invalidationReason, other.invalidationReason)
      && Objects.equal(this.valid, other.valid);
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
      .add("checkedRecords", checkedRecords)
      .add("duplicateIds", duplicateIds)
      .add("missingIds", missingIds)
      .add("allRecordsChecked", allRecordsChecked)
      .add("invalidationReason", invalidationReason)
      .add("valid", valid)
      .toString();
  }
}
package org.gbif.api.model.crawler;
import java.util.List;
import com.google.common.base.Objects;
/**
 * Result of checking the core/taxon identifiers of a checklist archive.
 * <p>
 * The report is valid only when both identifier samples handed to the constructor are empty. Validity and the
 * readable reason are derived once, at construction time, from snapshot copies of the two lists, so later changes
 * to the caller's lists cannot make them inconsistent with what this report holds.
 */
public class ChecklistValidationReport {
  // the number of records checked in the validation
  private final int checkedRecords;
  // false if we had to stop at our memory-saving limit
  private final boolean allRecordsChecked;
  // a sample of non unique core/taxon identifier (unmodifiable snapshot)
  private final List<String> duplicateIds;
  // a sample of data file line numbers with records missing a core/taxon identifier (unmodifiable snapshot)
  private final List<Integer> missingIds;
  // if the archive is not valid this will hold a readable reason, otherwise null
  private final String invalidationReason;
  // is this archive valid
  private final boolean valid;

  /**
   * @param checkedRecords    the number of records checked in the validation
   * @param allRecordsChecked false if the check stopped before all records were seen
   * @param duplicateIds      sample of non unique core/taxon identifiers, must not be null
   * @param missingIds        sample of data file line numbers lacking a core/taxon identifier, must not be null
   * @throws NullPointerException if either list is null
   */
  public ChecklistValidationReport(int checkedRecords, boolean allRecordsChecked, List<String> duplicateIds,
    List<Integer> missingIds) {
    // Fail with a named argument instead of an anonymous NPE from deep inside the validation.
    if (duplicateIds == null) {
      throw new NullPointerException("duplicateIds can't be null");
    }
    if (missingIds == null) {
      throw new NullPointerException("missingIds can't be null");
    }
    this.checkedRecords = checkedRecords;
    this.allRecordsChecked = allRecordsChecked;
    // Copy and wrap so the derived valid flag can never go stale through outside mutation.
    this.duplicateIds = java.util.Collections.unmodifiableList(new java.util.ArrayList<String>(duplicateIds));
    this.missingIds = java.util.Collections.unmodifiableList(new java.util.ArrayList<Integer>(missingIds));
    this.invalidationReason = findInvalidationReason();
    this.valid = invalidationReason == null;
  }

  /**
   * Determines why the archive is invalid. Duplicate identifiers take precedence over missing ones.
   * Must only be called once the two list fields are assigned.
   *
   * @return the readable reason, or null if nothing is wrong
   */
  private String findInvalidationReason() {
    if (!duplicateIds.isEmpty()) {
      return "Non unique taxon ids";
    }
    if (!missingIds.isEmpty()) {
      return "Missing taxon ids";
    }
    return null;
  }

  /**
   * @return the number of records checked in the validation
   */
  public int getCheckedRecords() {
    return checkedRecords;
  }

  /**
   * @return false if the check stopped before all records were seen
   */
  public boolean isAllRecordsChecked() {
    return allRecordsChecked;
  }

  /**
   * @return unmodifiable sample of non unique core/taxon identifiers, never null
   */
  public List<String> getDuplicateIds() {
    return duplicateIds;
  }

  /**
   * @return unmodifiable sample of data file line numbers lacking a core/taxon identifier, never null
   */
  public List<Integer> getMissingIds() {
    return missingIds;
  }

  /**
   * @return a readable reason why the archive is invalid, or null if it is valid
   */
  public String getInvalidationReason() {
    return invalidationReason;
  }

  /**
   * @return true if neither duplicate nor missing identifiers were reported
   */
  public boolean isValid() {
    return valid;
  }

  @Override
  public int hashCode() {
    return Objects.hashCode(checkedRecords, allRecordsChecked, duplicateIds, missingIds, invalidationReason, valid);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final ChecklistValidationReport other = (ChecklistValidationReport) obj;
    return Objects.equal(this.checkedRecords, other.checkedRecords)
      && Objects.equal(this.allRecordsChecked, other.allRecordsChecked)
      && Objects.equal(this.duplicateIds, other.duplicateIds)
      && Objects.equal(this.missingIds, other.missingIds)
      && Objects.equal(this.invalidationReason, other.invalidationReason)
      && Objects.equal(this.valid, other.valid);
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this)
      .add("checkedRecords", checkedRecords)
      .add("duplicateIds", duplicateIds)
      .add("missingIds", missingIds)
      .add("allRecordsChecked", allRecordsChecked)
      .add("invalidationReason", invalidationReason)
      .add("valid", valid)
      .toString();
  }
}
package org.gbif.api.model.crawler;
import java.util.List;
import java.util.UUID;
import javax.annotation.concurrent.Immutable;
import javax.annotation.concurrent.ThreadSafe;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;
import static com.google.common.base.Preconditions.checkNotNull;
/**
 * A report of the state of a dwc archive, mainly testing the existence and use of unique identifiers.
 * It combines checklist and occurrence reports.
 * <p>
 * Either sub-report may be null; a missing sub-report is simply ignored when judging validity or
 * building the invalidation reason.
 */
@Immutable
@ThreadSafe
public class DwcaValidationReport {
  private final UUID datasetKey;
  // report on the occurrence records, may be null
  private final OccurrenceValidationReport occurrenceReport;
  // report on the checklist records, may be null
  private final ChecklistValidationReport checklistReport;

  /**
   * @param datasetKey       key of the validated dataset, must not be null
   * @param occurrenceReport report on the occurrence records, may be null
   * @param checklistReport  report on the checklist records, may be null
   * @throws NullPointerException if datasetKey is null
   */
  public DwcaValidationReport(@JsonProperty("datasetKey") UUID datasetKey, OccurrenceValidationReport occurrenceReport,
    ChecklistValidationReport checklistReport) {
    this.datasetKey = checkNotNull(datasetKey, "datasetKey can't be null");
    this.occurrenceReport = occurrenceReport;
    this.checklistReport = checklistReport;
  }

  /**
   * @return true if none of the sub-reports that are present is invalid
   */
  public boolean isValid() {
    return !isOccurrenceInvalid() && !isChecklistInvalid();
  }

  /**
   * @return the key of the validated dataset, never null
   */
  public UUID getDatasetKey() {
    return datasetKey;
  }

  /**
   * Collects the reasons of every invalid sub-report, one per line.
   *
   * @return the combined readable reason, or an empty string if the archive is valid
   */
  public String getInvalidationReason() {
    StringBuilder sb = new StringBuilder();
    // Only a report that FAILED validation carries a reason worth printing.
    if (isOccurrenceInvalid()) {
      sb.append("Invalid Occurrences: ");
      sb.append(occurrenceReport.getInvalidationReason());
    }
    if (isChecklistInvalid()) {
      // separate from a preceding occurrence reason
      if (sb.length() > 0) {
        sb.append("\n");
      }
      sb.append("Invalid Checklist: ");
      sb.append(checklistReport.getInvalidationReason());
    }
    return sb.toString();
  }

  // true only if an occurrence report exists and it failed validation
  private boolean isOccurrenceInvalid() {
    return occurrenceReport != null && !occurrenceReport.isValid();
  }

  // true only if a checklist report exists and it failed validation
  private boolean isChecklistInvalid() {
    return checklistReport != null && !checklistReport.isValid();
  }
}
package org.gbif.api.model.crawler;
import java.util.List;
import java.util.UUID;
import javax.annotation.concurrent.Immutable;
import javax.annotation.concurrent.ThreadSafe;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;
import static com.google.common.base.Preconditions.checkNotNull;
/**
 * A report of the state of a dwc archive, mainly testing the existence and use of unique identifiers.
 * It combines checklist and occurrence reports.
 * <p>
 * Either sub-report may be null; a missing sub-report is simply ignored when judging validity or
 * building the invalidation reason.
 */
@Immutable
@ThreadSafe
public class DwcaValidationReport {
  private final UUID datasetKey;
  // report on the occurrence records, may be null
  private final OccurrenceValidationReport occurrenceReport;
  // report on the checklist records, may be null
  private final ChecklistValidationReport checklistReport;

  /**
   * @param datasetKey       key of the validated dataset, must not be null
   * @param occurrenceReport report on the occurrence records, may be null
   * @param checklistReport  report on the checklist records, may be null
   * @throws NullPointerException if datasetKey is null
   */
  public DwcaValidationReport(@JsonProperty("datasetKey") UUID datasetKey, OccurrenceValidationReport occurrenceReport,
    ChecklistValidationReport checklistReport) {
    this.datasetKey = checkNotNull(datasetKey, "datasetKey can't be null");
    this.occurrenceReport = occurrenceReport;
    this.checklistReport = checklistReport;
  }

  /**
   * @return true if none of the sub-reports that are present is invalid
   */
  public boolean isValid() {
    return !isOccurrenceInvalid() && !isChecklistInvalid();
  }

  /**
   * @return the key of the validated dataset, never null
   */
  public UUID getDatasetKey() {
    return datasetKey;
  }

  /**
   * Collects the reasons of every invalid sub-report, one per line.
   *
   * @return the combined readable reason, or an empty string if the archive is valid
   */
  public String getInvalidationReason() {
    StringBuilder sb = new StringBuilder();
    // Only a report that FAILED validation carries a reason worth printing.
    if (isOccurrenceInvalid()) {
      sb.append("Invalid Occurrences: ");
      sb.append(occurrenceReport.getInvalidationReason());
    }
    if (isChecklistInvalid()) {
      // separate from a preceding occurrence reason
      if (sb.length() > 0) {
        sb.append("\n");
      }
      sb.append("Invalid Checklist: ");
      sb.append(checklistReport.getInvalidationReason());
    }
    return sb.toString();
  }

  // true only if an occurrence report exists and it failed validation
  private boolean isOccurrenceInvalid() {
    return occurrenceReport != null && !occurrenceReport.isValid();
  }

  // true only if a checklist report exists and it failed validation
  private boolean isChecklistInvalid() {
    return checklistReport != null && !checklistReport.isValid();
  }
}
package org.gbif.api.model.crawler;
import java.util.List;
import java.util.UUID;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;
import static com.google.common.base.Preconditions.checkNotNull;
/**
 * Report on the identifiers (triplets and occurrenceIds) of the occurrence records checked in an archive.
 * Validity and the readable reason are derived once, at construction time, from the supplied counts.
 * <p>
 * The rules followed here should match the document at:
 * http://dev.gbif.org/wiki/display/INT/Identifier+problems+and+how+to+solve+them.
 */
public class OccurrenceValidationReport {
  // if the percentage of invalid triplets (eg missing catalog number) is greater than this, the archive is invalid
  private static final double INVALID_TRIPLET_THRESHOLD = 0.25;
  // the number of occurrence records checked in the validation
  private final int checkedRecords;
  // the number of triplets that were unique
  private final int uniqueTriplets;
  /**
   * the number of triplets that were invalid (because one or more of institutionCode, collectionCode or catalogNumber
   * were null or empty)
   */
  private final int recordsWithInvalidTriplets;
  // the number of occurrenceIds that were unique (therefore also == the number of records with unique occurrenceId)
  private final int uniqueOccurrenceIds;
  // records that had no occurrenceId
  private final int recordsMissingOccurrenceId;
  // false if we had to stop at our memory-saving limit
  private final boolean allRecordsChecked;
  // if the archive is not valid this will hold a readable reason, otherwise null
  private final String invalidationReason;
  // is this archive valid
  private final boolean valid;

  /**
   * NOTE(review): the JSON names "invalidTriplets", "uniqueOccIds" and "missingOccIds" used here differ from the
   * names of the matching getters (getRecordsWithInvalidTriplets, getUniqueOccurrenceIds,
   * getRecordsMissingOccurrenceId) - confirm that a serialize/deserialize round trip works as intended.
   *
   * @param checkedRecords             the number of occurrence records checked
   * @param uniqueTriplets             the number of triplets that were unique
   * @param recordsWithInvalidTriplets the number of records whose triplet was invalid
   * @param uniqueOccurrenceIds        the number of occurrenceIds that were unique
   * @param recordsMissingOccurrenceId the number of records that had no occurrenceId
   * @param allRecordsChecked          false if the check stopped before all records were seen
   */
  @JsonCreator
  public OccurrenceValidationReport(@JsonProperty("checkedRecords") int checkedRecords,
    @JsonProperty("uniqueTriplets") int uniqueTriplets,
    @JsonProperty("invalidTriplets") int recordsWithInvalidTriplets,
    @JsonProperty("uniqueOccIds") int uniqueOccurrenceIds,
    @JsonProperty("missingOccIds") int recordsMissingOccurrenceId,
    @JsonProperty("allRecordsChecked") boolean allRecordsChecked) {
    this.checkedRecords = checkedRecords;
    this.uniqueTriplets = uniqueTriplets;
    this.recordsWithInvalidTriplets = recordsWithInvalidTriplets;
    this.uniqueOccurrenceIds = uniqueOccurrenceIds;
    this.recordsMissingOccurrenceId = recordsMissingOccurrenceId;
    this.allRecordsChecked = allRecordsChecked;
    // All counts are assigned above, so the reason can be derived and stored in a final field.
    this.invalidationReason = findInvalidationReason();
    this.valid = invalidationReason == null;
  }

  /**
   * Applies the validation rules to the counts held by this report.
   * <p>
   * At the moment the only truly fatal conditions are:
   * - whole archive is empty or unreadable
   * - triplets are invalid (% invalid > than our threshold) && occIds are invalid (must be 100% coverage and unique)
   * - any duplicate triplets && occIds are invalid
   *
   * @return the readable reason listing every failed check, or null if the archive is valid
   */
  private String findInvalidationReason() {
    boolean hasRecords = checkedRecords > 0;
    // guard the division: an empty archive has a ratio of 0
    double invalidRatio = hasRecords ? (double) recordsWithInvalidTriplets / checkedRecords : 0;
    boolean invalidTripletsBelowLimit = invalidRatio <= INVALID_TRIPLET_THRESHOLD;
    // every record with a valid triplet has a distinct one
    boolean hasUniqueTriplets = uniqueTriplets == checkedRecords - recordsWithInvalidTriplets;
    // every record that has an occurrenceId has a distinct one
    boolean hasUniqueOccIds = uniqueOccurrenceIds == checkedRecords - recordsMissingOccurrenceId;
    // full coverage: each checked record has its own distinct occurrenceId
    boolean hasGoodOccIds = uniqueOccurrenceIds == checkedRecords;

    boolean looksValid = hasRecords && (invalidTripletsBelowLimit && hasUniqueTriplets || hasGoodOccIds);
    if (looksValid) {
      return null;
    }

    List<String> reasons = Lists.newArrayList();
    if (!hasRecords) {
      reasons.add("No readable records");
    }
    if (!invalidTripletsBelowLimit) {
      reasons.add(Math.round(100 * invalidRatio) + "% invalid triplets is > than threshold of " + Math
        .round(100 * INVALID_TRIPLET_THRESHOLD) + '%');
    }
    if (!hasUniqueTriplets) {
      reasons.add((checkedRecords - recordsWithInvalidTriplets - uniqueTriplets) + " duplicate triplets detected");
    }
    if (!hasGoodOccIds) {
      if (recordsMissingOccurrenceId != 0) {
        reasons.add(recordsMissingOccurrenceId + " records without an occurrence id (should be 0)");
      }
      if (!hasUniqueOccIds) {
        reasons.add(
          (checkedRecords - recordsMissingOccurrenceId - uniqueOccurrenceIds) + " duplicate occurrence ids detected");
      }
    }
    String reason = Joiner.on("; ").join(reasons);
    return "Archive invalid because [" + reason + ']';
  }

  /**
   * @return the number of occurrence records checked in the validation
   */
  public int getCheckedRecords() {
    return checkedRecords;
  }

  /**
   * @return the number of triplets that were unique
   */
  public int getUniqueTriplets() {
    return uniqueTriplets;
  }

  /**
   * @return the number of records whose triplet was invalid
   */
  public int getRecordsWithInvalidTriplets() {
    return recordsWithInvalidTriplets;
  }

  /**
   * @return the number of occurrenceIds that were unique
   */
  public int getUniqueOccurrenceIds() {
    return uniqueOccurrenceIds;
  }

  /**
   * @return the number of records that had no occurrenceId
   */
  public int getRecordsMissingOccurrenceId() {
    return recordsMissingOccurrenceId;
  }

  /**
   * @return false if the check stopped before all records were seen
   */
  public boolean isAllRecordsChecked() {
    return allRecordsChecked;
  }

  /**
   * @return a readable reason why the archive is invalid, or null if it is valid
   */
  public String getInvalidationReason() {
    return invalidationReason;
  }

  /**
   * @return true if the counts passed the validation rules
   */
  public boolean isValid() {
    return valid;
  }

  // invalidationReason is left out of hashCode/equals: it is derived from the fields compared here
  @Override
  public int hashCode() {
    return Objects.hashCode(checkedRecords, uniqueTriplets, recordsWithInvalidTriplets, uniqueOccurrenceIds,
      recordsMissingOccurrenceId, allRecordsChecked, valid);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final OccurrenceValidationReport other = (OccurrenceValidationReport) obj;
    return Objects.equal(this.checkedRecords, other.checkedRecords)
      && Objects.equal(this.uniqueTriplets, other.uniqueTriplets)
      && Objects.equal(this.recordsWithInvalidTriplets, other.recordsWithInvalidTriplets)
      && Objects.equal(this.uniqueOccurrenceIds, other.uniqueOccurrenceIds)
      && Objects.equal(this.recordsMissingOccurrenceId, other.recordsMissingOccurrenceId)
      && Objects.equal(this.allRecordsChecked, other.allRecordsChecked)
      && Objects.equal(this.valid, other.valid);
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this).add("checkedRecords", checkedRecords)
      .add("uniqueTriplets", uniqueTriplets).add("recordsWithInvalidTriplets", recordsWithInvalidTriplets)
      .add("uniqueOccurrenceIds", uniqueOccurrenceIds).add("recordsMissingOccurrenceId", recordsMissingOccurrenceId)
      .add("allRecordsChecked", allRecordsChecked).add("invalidationReason", invalidationReason).add("valid", valid)
      .toString();
  }
}
package org.gbif.api.model.crawler;
import java.util.List;
import java.util.UUID;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.collect.Lists;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;
import static com.google.common.base.Preconditions.checkNotNull;
/**
 * Report on the identifiers (triplets and occurrenceIds) of the occurrence records checked in an archive.
 * Validity and the readable reason are derived once, at construction time, from the supplied counts.
 * <p>
 * The rules followed here should match the document at:
 * http://dev.gbif.org/wiki/display/INT/Identifier+problems+and+how+to+solve+them.
 */
public class OccurrenceValidationReport {
  // if the percentage of invalid triplets (eg missing catalog number) is greater than this, the archive is invalid
  private static final double INVALID_TRIPLET_THRESHOLD = 0.25;
  // the number of occurrence records checked in the validation
  private final int checkedRecords;
  // the number of triplets that were unique
  private final int uniqueTriplets;
  /**
   * the number of triplets that were invalid (because one or more of institutionCode, collectionCode or catalogNumber
   * were null or empty)
   */
  private final int recordsWithInvalidTriplets;
  // the number of occurrenceIds that were unique (therefore also == the number of records with unique occurrenceId)
  private final int uniqueOccurrenceIds;
  // records that had no occurrenceId
  private final int recordsMissingOccurrenceId;
  // false if we had to stop at our memory-saving limit
  private final boolean allRecordsChecked;
  // if the archive is not valid this will hold a readable reason, otherwise null
  private final String invalidationReason;
  // is this archive valid
  private final boolean valid;

  /**
   * NOTE(review): the JSON names "invalidTriplets", "uniqueOccIds" and "missingOccIds" used here differ from the
   * names of the matching getters (getRecordsWithInvalidTriplets, getUniqueOccurrenceIds,
   * getRecordsMissingOccurrenceId) - confirm that a serialize/deserialize round trip works as intended.
   *
   * @param checkedRecords             the number of occurrence records checked
   * @param uniqueTriplets             the number of triplets that were unique
   * @param recordsWithInvalidTriplets the number of records whose triplet was invalid
   * @param uniqueOccurrenceIds        the number of occurrenceIds that were unique
   * @param recordsMissingOccurrenceId the number of records that had no occurrenceId
   * @param allRecordsChecked          false if the check stopped before all records were seen
   */
  @JsonCreator
  public OccurrenceValidationReport(@JsonProperty("checkedRecords") int checkedRecords,
    @JsonProperty("uniqueTriplets") int uniqueTriplets,
    @JsonProperty("invalidTriplets") int recordsWithInvalidTriplets,
    @JsonProperty("uniqueOccIds") int uniqueOccurrenceIds,
    @JsonProperty("missingOccIds") int recordsMissingOccurrenceId,
    @JsonProperty("allRecordsChecked") boolean allRecordsChecked) {
    this.checkedRecords = checkedRecords;
    this.uniqueTriplets = uniqueTriplets;
    this.recordsWithInvalidTriplets = recordsWithInvalidTriplets;
    this.uniqueOccurrenceIds = uniqueOccurrenceIds;
    this.recordsMissingOccurrenceId = recordsMissingOccurrenceId;
    this.allRecordsChecked = allRecordsChecked;
    // All counts are assigned above, so the reason can be derived and stored in a final field.
    this.invalidationReason = findInvalidationReason();
    this.valid = invalidationReason == null;
  }

  /**
   * Applies the validation rules to the counts held by this report.
   * <p>
   * At the moment the only truly fatal conditions are:
   * - whole archive is empty or unreadable
   * - triplets are invalid (% invalid > than our threshold) && occIds are invalid (must be 100% coverage and unique)
   * - any duplicate triplets && occIds are invalid
   *
   * @return the readable reason listing every failed check, or null if the archive is valid
   */
  private String findInvalidationReason() {
    boolean hasRecords = checkedRecords > 0;
    // guard the division: an empty archive has a ratio of 0
    double invalidRatio = hasRecords ? (double) recordsWithInvalidTriplets / checkedRecords : 0;
    boolean invalidTripletsBelowLimit = invalidRatio <= INVALID_TRIPLET_THRESHOLD;
    // every record with a valid triplet has a distinct one
    boolean hasUniqueTriplets = uniqueTriplets == checkedRecords - recordsWithInvalidTriplets;
    // every record that has an occurrenceId has a distinct one
    boolean hasUniqueOccIds = uniqueOccurrenceIds == checkedRecords - recordsMissingOccurrenceId;
    // full coverage: each checked record has its own distinct occurrenceId
    boolean hasGoodOccIds = uniqueOccurrenceIds == checkedRecords;

    boolean looksValid = hasRecords && (invalidTripletsBelowLimit && hasUniqueTriplets || hasGoodOccIds);
    if (looksValid) {
      return null;
    }

    List<String> reasons = Lists.newArrayList();
    if (!hasRecords) {
      reasons.add("No readable records");
    }
    if (!invalidTripletsBelowLimit) {
      reasons.add(Math.round(100 * invalidRatio) + "% invalid triplets is > than threshold of " + Math
        .round(100 * INVALID_TRIPLET_THRESHOLD) + '%');
    }
    if (!hasUniqueTriplets) {
      reasons.add((checkedRecords - recordsWithInvalidTriplets - uniqueTriplets) + " duplicate triplets detected");
    }
    if (!hasGoodOccIds) {
      if (recordsMissingOccurrenceId != 0) {
        reasons.add(recordsMissingOccurrenceId + " records without an occurrence id (should be 0)");
      }
      if (!hasUniqueOccIds) {
        reasons.add(
          (checkedRecords - recordsMissingOccurrenceId - uniqueOccurrenceIds) + " duplicate occurrence ids detected");
      }
    }
    String reason = Joiner.on("; ").join(reasons);
    return "Archive invalid because [" + reason + ']';
  }

  /**
   * @return the number of occurrence records checked in the validation
   */
  public int getCheckedRecords() {
    return checkedRecords;
  }

  /**
   * @return the number of triplets that were unique
   */
  public int getUniqueTriplets() {
    return uniqueTriplets;
  }

  /**
   * @return the number of records whose triplet was invalid
   */
  public int getRecordsWithInvalidTriplets() {
    return recordsWithInvalidTriplets;
  }

  /**
   * @return the number of occurrenceIds that were unique
   */
  public int getUniqueOccurrenceIds() {
    return uniqueOccurrenceIds;
  }

  /**
   * @return the number of records that had no occurrenceId
   */
  public int getRecordsMissingOccurrenceId() {
    return recordsMissingOccurrenceId;
  }

  /**
   * @return false if the check stopped before all records were seen
   */
  public boolean isAllRecordsChecked() {
    return allRecordsChecked;
  }

  /**
   * @return a readable reason why the archive is invalid, or null if it is valid
   */
  public String getInvalidationReason() {
    return invalidationReason;
  }

  /**
   * @return true if the counts passed the validation rules
   */
  public boolean isValid() {
    return valid;
  }

  // invalidationReason is left out of hashCode/equals: it is derived from the fields compared here
  @Override
  public int hashCode() {
    return Objects.hashCode(checkedRecords, uniqueTriplets, recordsWithInvalidTriplets, uniqueOccurrenceIds,
      recordsMissingOccurrenceId, allRecordsChecked, valid);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
      return false;
    }
    final OccurrenceValidationReport other = (OccurrenceValidationReport) obj;
    return Objects.equal(this.checkedRecords, other.checkedRecords)
      && Objects.equal(this.uniqueTriplets, other.uniqueTriplets)
      && Objects.equal(this.recordsWithInvalidTriplets, other.recordsWithInvalidTriplets)
      && Objects.equal(this.uniqueOccurrenceIds, other.uniqueOccurrenceIds)
      && Objects.equal(this.recordsMissingOccurrenceId, other.recordsMissingOccurrenceId)
      && Objects.equal(this.allRecordsChecked, other.allRecordsChecked)
      && Objects.equal(this.valid, other.valid);
  }

  @Override
  public String toString() {
    return Objects.toStringHelper(this).add("checkedRecords", checkedRecords)
      .add("uniqueTriplets", uniqueTriplets).add("recordsWithInvalidTriplets", recordsWithInvalidTriplets)
      .add("uniqueOccurrenceIds", uniqueOccurrenceIds).add("recordsMissingOccurrenceId", recordsMissingOccurrenceId)
      .add("allRecordsChecked", allRecordsChecked).add("invalidationReason", invalidationReason).add("valid", valid)
      .toString();
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment